From ba834301849747d884e47ec6176804a69952bea4 Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 14:16:40 -0700 Subject: [PATCH 1/7] openvmm_core: add GIC SPI layout allocator Introduce a deterministic SPI layout resolver for aarch64 VMs, analogous to the memory layout engine. All GIC SPI assignments for platform devices (GICv2m MSI block) are computed in a single top-down pass over the SPI range [64, 1019], ensuring the layout is a pure function of the VM config. This is critical for hibernation stability. The BuildTopology trait is replaced with cfg-gated free functions, and the GicMsiConfig::V2m variant now carries an explicit spi_count field so the user can control the v2m block size. The DEFAULT_GIC_V2M_SPI_BASE constant is removed since the allocator picks the base dynamically. This is not very interesting yet, but it will become more interesting when we add vSMMUs (which need SPIs). --- openvmm/openvmm_core/src/worker/dispatch.rs | 329 +++++++++--------- openvmm/openvmm_core/src/worker/mod.rs | 1 + openvmm/openvmm_core/src/worker/spi_layout.rs | 114 ++++++ openvmm/openvmm_defs/src/config.rs | 10 +- openvmm/openvmm_entry/src/lib.rs | 4 +- 5 files changed, 294 insertions(+), 164 deletions(-) create mode 100644 openvmm/openvmm_core/src/worker/spi_layout.rs diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 4cbe51c1b8..65557ed06b 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -34,7 +34,6 @@ use ide_resources::IdeDeviceConfig; use igvm::IgvmFile; use input_core::InputData; use input_core::MultiplexedInputHandle; -use inspect::Inspect; use local_clock::LocalClockDelta; use membacking::GuestMemoryBuilder; use membacking::GuestMemoryManager; @@ -113,7 +112,6 @@ use vm_resource::kind::VirtioDeviceHandle; use vm_resource::kind::VmbusDeviceHandleKind; use vm_topology::memory::MemoryLayout; use vm_topology::pcie::PcieHostBridge; -use 
vm_topology::processor::ArchTopology; use vm_topology::processor::ProcessorTopology; use vm_topology::processor::TopologyBuilder; use vm_topology::processor::aarch64::Aarch64Topology; @@ -414,13 +412,6 @@ pub(crate) struct InitializedVm { driver_source: VmTaskDriverSource, } -trait BuildTopology { - fn to_topology( - &self, - platform_info: &virt::PlatformInfo, - ) -> anyhow::Result>; -} - trait ExtractTopologyConfig { fn to_config(&self) -> ProcessorTopologyConfig; } @@ -446,38 +437,35 @@ impl ExtractTopologyConfig for ProcessorTopology { } #[cfg(guest_arch = "x86_64")] -impl BuildTopology for ProcessorTopologyConfig { - fn to_topology( - &self, - _platform_info: &virt::PlatformInfo, - ) -> anyhow::Result> { - use vm_topology::processor::x86::X2ApicState; - - let arch = match &self.arch { - None => Default::default(), - Some(ArchTopologyConfig::X86(arch)) => arch.clone(), - _ => anyhow::bail!("invalid architecture config"), - }; - let mut builder = TopologyBuilder::from_host_topology()?; - builder.apic_id_offset(arch.apic_id_offset); - if let Some(smt) = self.enable_smt { - builder.smt_enabled(smt); - } - if let Some(count) = self.vps_per_socket { - builder.vps_per_socket(count); - } - let x2apic = match arch.x2apic { - X2ApicConfig::Auto => { - // FUTURE: query the hypervisor for a recommendation. - X2ApicState::Supported - } - X2ApicConfig::Supported => X2ApicState::Supported, - X2ApicConfig::Unsupported => X2ApicState::Unsupported, - X2ApicConfig::Enabled => X2ApicState::Enabled, - }; - builder.x2apic(x2apic); - Ok(builder.build(self.proc_count)?) 
+fn build_x86_topology( + config: &ProcessorTopologyConfig, +) -> anyhow::Result> { + use vm_topology::processor::x86::X2ApicState; + + let arch = match &config.arch { + None => Default::default(), + Some(ArchTopologyConfig::X86(arch)) => arch.clone(), + _ => anyhow::bail!("invalid architecture config"), + }; + let mut builder = TopologyBuilder::from_host_topology()?; + builder.apic_id_offset(arch.apic_id_offset); + if let Some(smt) = config.enable_smt { + builder.smt_enabled(smt); } + if let Some(count) = config.vps_per_socket { + builder.vps_per_socket(count); + } + let x2apic = match arch.x2apic { + X2ApicConfig::Auto => { + // FUTURE: query the hypervisor for a recommendation. + X2ApicState::Supported + } + X2ApicConfig::Supported => X2ApicState::Supported, + X2ApicConfig::Unsupported => X2ApicState::Unsupported, + X2ApicConfig::Enabled => X2ApicState::Enabled, + }; + builder.x2apic(x2apic); + Ok(builder.build(config.proc_count)?) } impl ExtractTopologyConfig for ProcessorTopology { @@ -512,140 +500,107 @@ impl ExtractTopologyConfig for ProcessorTopology { } #[cfg(guest_arch = "aarch64")] -impl BuildTopology for ProcessorTopologyConfig { - fn to_topology( - &self, - platform_info: &virt::PlatformInfo, - ) -> anyhow::Result> { - use vm_topology::processor::aarch64::Aarch64PlatformConfig; - use vm_topology::processor::aarch64::GicItsInfo; - use vm_topology::processor::aarch64::GicMsiController; - use vm_topology::processor::aarch64::GicV2mInfo; - - let arch = match &self.arch { - None => Default::default(), - Some(ArchTopologyConfig::Aarch64(arch)) => arch.clone(), - _ => anyhow::bail!("invalid architecture config"), - }; +fn build_aarch64_topology( + config: &ProcessorTopologyConfig, + platform_info: &virt::PlatformInfo, + gic_msi: vm_topology::processor::aarch64::GicMsiController, +) -> anyhow::Result> { + use vm_topology::processor::aarch64::Aarch64PlatformConfig; + + let arch = match &config.arch { + None => Default::default(), + 
Some(ArchTopologyConfig::Aarch64(arch)) => arch.clone(), + _ => anyhow::bail!("invalid architecture config"), + }; - let pmu_gsiv = match arch.pmu_gsiv { - PmuGsivConfig::Disabled => None, - PmuGsivConfig::Gsiv(gsiv) => Some(gsiv), - PmuGsivConfig::Platform => platform_info.platform_gsiv, - }; + let pmu_gsiv = match arch.pmu_gsiv { + PmuGsivConfig::Disabled => None, + PmuGsivConfig::Gsiv(gsiv) => Some(gsiv), + PmuGsivConfig::Platform => platform_info.platform_gsiv, + }; - // TODO: When this value is supported on all platforms, we should change - // the arch config to not be an option. For now, warn since the ARM VBSA - // expects this to be available. - if pmu_gsiv.is_none() { - tracing::warn!("PMU GSIV is not set"); - } + // TODO: When this value is supported on all platforms, we should change + // the arch config to not be an option. For now, warn since the ARM VBSA + // expects this to be available. + if pmu_gsiv.is_none() { + tracing::warn!("PMU GSIV is not set"); + } - let (gic_distributor_base, gic_version) = match &arch.gic_config { - Some(GicConfig::V3(config)) => { - let dist = config - .as_ref() - .map(|c| c.gic_distributor_base) - .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE); - let redist = config - .as_ref() - .map(|c| c.gic_redistributors_base) - .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE); + let (gic_distributor_base, gic_version) = match &arch.gic_config { + Some(GicConfig::V3(config)) => { + let dist = config + .as_ref() + .map(|c| c.gic_distributor_base) + .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE); + let redist = config + .as_ref() + .map(|c| c.gic_redistributors_base) + .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE); + ( + dist, + GicVersion::V3 { + redistributors_base: redist, + }, + ) + } + Some(GicConfig::V2(config)) => { + let dist = config + .as_ref() + .map(|c| c.gic_distributor_base) + .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE); + let cpu_if 
= config + .as_ref() + .map(|c| c.cpu_interface_base) + .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE); + ( + dist, + GicVersion::V2 { + cpu_interface_base: cpu_if, + }, + ) + } + None => { + // No explicit GIC config — use the hypervisor's detected version + // with default addresses. + let dist = openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE; + let second = openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE; + if platform_info.supports_gic_v3 { ( dist, GicVersion::V3 { - redistributors_base: redist, + redistributors_base: second, }, ) - } - Some(GicConfig::V2(config)) => { - let dist = config - .as_ref() - .map(|c| c.gic_distributor_base) - .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE); - let cpu_if = config - .as_ref() - .map(|c| c.cpu_interface_base) - .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE); + } else { ( dist, GicVersion::V2 { - cpu_interface_base: cpu_if, + cpu_interface_base: second, }, ) } - None => { - // No explicit GIC config — use the hypervisor's detected version - // with default addresses. - let dist = openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE; - let second = openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE; - if platform_info.supports_gic_v3 { - ( - dist, - GicVersion::V3 { - redistributors_base: second, - }, - ) - } else { - ( - dist, - GicVersion::V2 { - cpu_interface_base: second, - }, - ) - } - } - }; - - // Use the ITS for MSI delivery when the backend supports it - // (KVM with GICv3). Otherwise fall back to GICv2m (SPI-based MSIs). - use openvmm_defs::config::GicMsiConfig; - let is_gicv2 = matches!(gic_version, GicVersion::V2 { .. 
}); - let use_its = match arch.gic_msi { - GicMsiConfig::Auto => platform_info.supports_its && !is_gicv2, - GicMsiConfig::Its => { - if is_gicv2 { - anyhow::bail!("ITS is incompatible with GICv2"); - } - if !platform_info.supports_its { - anyhow::bail!("ITS requested but the hypervisor does not support it"); - } - true - } - GicMsiConfig::V2m => false, - }; - let gic_msi = if use_its { - GicMsiController::Its(GicItsInfo { - its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE, - }) - } else { - GicMsiController::V2m(GicV2mInfo { - frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, - spi_base: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_BASE, - spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT, - }) - }; + } + }; - let platform = Aarch64PlatformConfig { - gic_distributor_base, - gic_version, - gic_msi, - pmu_gsiv, - virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI, - gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS, - }; + let platform = Aarch64PlatformConfig { + gic_distributor_base, + gic_version, + gic_msi, + pmu_gsiv, + virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI, + gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS, + }; - let mut builder = TopologyBuilder::new_aarch64(platform); - if let Some(smt) = self.enable_smt { - builder.smt_enabled(smt); - } - if let Some(count) = self.vps_per_socket { - builder.vps_per_socket(count); - } else { - builder.vps_per_socket(self.proc_count); - } - Ok(builder.build(self.proc_count)?) + let mut builder = TopologyBuilder::new_aarch64(platform); + if let Some(smt) = config.enable_smt { + builder.smt_enabled(smt); + } + if let Some(count) = config.vps_per_socket { + builder.vps_per_socket(count); + } else { + builder.vps_per_socket(config.proc_count); } + Ok(builder.build(config.proc_count)?) } /// A VM that has been loaded and can be run. 
@@ -818,6 +773,7 @@ impl InitializedVm { pub(crate) async fn new_with_hypervisor( driver_source: VmTaskDriverSource, hypervisor: &mut H, + #[cfg_attr(not(guest_arch = "aarch64"), expect(unused_variables))] platform_info: virt::PlatformInfo, cfg: Manifest, shared_memory: Option, @@ -865,7 +821,64 @@ impl InitializedVm { None }; - let processor_topology = cfg.processor_topology.to_topology(&platform_info)?; + cfg_if! { + if #[cfg(guest_arch = "aarch64")] { + use openvmm_defs::config::GicMsiConfig; + use vm_topology::processor::aarch64::GicItsInfo; + use vm_topology::processor::aarch64::GicMsiController; + use vm_topology::processor::aarch64::GicV2mInfo; + + // Resolve ITS vs v2m and determine v2m SPI count. + let arch_config = match &cfg.processor_topology.arch { + Some(ArchTopologyConfig::Aarch64(a)) => a, + _ => &Aarch64TopologyConfig::default(), + }; + let is_gicv2 = match &arch_config.gic_config { + Some(GicConfig::V2(_)) => true, + _ => !platform_info.supports_gic_v3, + }; + let v2m_spi_count = match &arch_config.gic_msi { + GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None, + GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT), + GicMsiConfig::Its => { + if is_gicv2 { + anyhow::bail!("ITS is incompatible with GICv2"); + } + if !platform_info.supports_its { + anyhow::bail!("ITS requested but the hypervisor does not support it"); + } + None + } + GicMsiConfig::V2m { spi_count } => Some(*spi_count), + }; + + // Resolve SPI layout — all SPI allocations in one deterministic pass. + let spi_layout = super::spi_layout::resolve_spi_layout( + &super::spi_layout::SpiLayoutInput { + v2m_spi_count, + }, + )?; + + // Build the GIC MSI controller from resolved SPIs. 
+ let gic_msi = if let Some(count) = v2m_spi_count { + GicMsiController::V2m(GicV2mInfo { + frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, + spi_base: spi_layout.v2m_spi_base.expect("v2m base must be allocated when v2m_spi_count is Some"), + spi_count: count, + }) + } else { + GicMsiController::Its(GicItsInfo { + its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE, + }) + }; + + let processor_topology = + build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)?; + } else { + let processor_topology = + build_x86_topology(&cfg.processor_topology)?; + } + } let proto = hypervisor .new_partition(virt::ProtoPartitionConfig { diff --git a/openvmm/openvmm_core/src/worker/mod.rs b/openvmm/openvmm_core/src/worker/mod.rs index b36faa2154..3df33c74e3 100644 --- a/openvmm/openvmm_core/src/worker/mod.rs +++ b/openvmm/openvmm_core/src/worker/mod.rs @@ -4,4 +4,5 @@ pub mod dispatch; mod memory_layout; mod rom; +mod spi_layout; pub mod vm_loaders; diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs new file mode 100644 index 0000000000..97065590e8 --- /dev/null +++ b/openvmm/openvmm_core/src/worker/spi_layout.rs @@ -0,0 +1,114 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#![cfg(guest_arch = "aarch64")] + +//! GIC SPI layout resolver for aarch64 VMs. +//! +//! This module determines the GIC SPI assignments for all platform devices +//! that need dynamically allocated interrupts. It is the SPI analogue of +//! [`super::memory_layout`]: all allocations happen in a single deterministic +//! pass so that the assignments are a pure function of the VM configuration. +//! This is critical for hibernation — a resumed VM must get the same SPI +//! layout as the original. +//! +//! SPIs are allocated top-down from INTID 1019. This maximizes distance from +//! the guest-side vPCI MSI allocator (Hyper-V PCI driver in Linux), which +//! 
allocates bottom-up starting at INTID 64. + +/// Top-down GIC SPI allocator. +struct SpiAllocator { + range_start: u32, + cursor: u32, +} + +impl SpiAllocator { + fn new(range: std::ops::RangeInclusive<u32>) -> Self { + Self { + range_start: *range.start(), + cursor: *range.end(), + } + } + + /// Allocates a single SPI, returning its GIC INTID. + fn alloc(&mut self, tag: &str) -> anyhow::Result<u32> { + if self.cursor < self.range_start { + anyhow::bail!("SPI exhausted allocating {tag}"); + } + let intid = self.cursor; + self.cursor -= 1; + Ok(intid) + } + + /// Allocates a contiguous block of `count` SPIs, returning the lowest + /// GIC INTID in the block. + fn alloc_block(&mut self, tag: &str, count: u32) -> anyhow::Result<u32> { + let available = self.cursor.saturating_sub(self.range_start) + 1; + if count == 0 || count > available { + anyhow::bail!( + "SPI exhausted allocating {tag}: need {count}, only {available} remaining" + ); + } + let base = self.cursor - count + 1; + self.cursor = base - 1; + Ok(base) + } +} + +/// Inputs to the SPI layout resolver. +pub(super) struct SpiLayoutInput { + /// Number of SPIs to reserve for GICv2m MSI delivery. `None` when using + /// ITS (no v2m block needed). + pub v2m_spi_count: Option<u32>, +} + +/// Resolved SPI assignments for all platform devices. +pub(super) struct ResolvedSpiLayout { + /// GICv2m SPI base INTID. `None` when using ITS. + pub v2m_spi_base: Option<u32>, +} + +/// Resolves SPI assignments for all platform devices. +/// +/// All allocations happen here in a single top-down pass over the SPI range +/// `[64, 1019]`. The order of allocations determines the layout and must not +/// change across OpenVMM versions for a given config, or hibernation will +/// break. +pub(super) fn resolve_spi_layout(input: &SpiLayoutInput) -> anyhow::Result<ResolvedSpiLayout> { + let mut spi = SpiAllocator::new(64..=1019); + + // --- Allocation order (do not reorder!) --- + + // 1. GICv2m MSI block. 
+ let v2m_spi_base = input + .v2m_spi_count + .map(|count| spi.alloc_block("gicv2m", count)) + .transpose()?; + + Ok(ResolvedSpiLayout { v2m_spi_base }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn v2m_allocation() { + let result = resolve_spi_layout(&SpiLayoutInput { + v2m_spi_count: Some(64), + }) + .unwrap(); + + assert_eq!(result.v2m_spi_base, Some(956)); + } + + #[test] + fn its_skips_v2m() { + let result = resolve_spi_layout(&SpiLayoutInput { + v2m_spi_count: None, + }) + .unwrap(); + + assert_eq!(result.v2m_spi_base, None); + } +} diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 4e4af4dd4b..c2144a8216 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -80,10 +80,7 @@ pub const DEFAULT_GIC_V2M_MSI_FRAME_BASE: u64 = 0xEFFE_8000; /// Size of the v2m MSI frame (one 4KB page is the architectural minimum). pub const GIC_V2M_MSI_FRAME_SIZE: u64 = 0x1000; -/// First GIC interrupt ID reserved for PCIe MSIs via the v2m frame. -/// Must be in the SPI range (32–1019) and not conflict with other devices. -pub const DEFAULT_GIC_V2M_SPI_BASE: u32 = 512; -/// Number of SPIs reserved for PCIe MSIs. +/// Default number of SPIs reserved for PCIe MSIs when using GICv2m. pub const DEFAULT_GIC_V2M_SPI_COUNT: u32 = 64; /// Base address of the GICv3 ITS MMIO region. Must be 64 KiB aligned, @@ -296,7 +293,10 @@ pub enum GicMsiConfig { /// Force GICv3 ITS for MSI delivery via LPIs. Its, /// Force GICv2m for MSI delivery via SPIs. - V2m, + V2m { + /// Number of SPIs to reserve for PCIe MSIs. 
+ spi_count: u32, + }, } #[derive(Debug, Protobuf, Default, Clone)] diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 6afbc14913..542253cb3f 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -1285,7 +1285,9 @@ async fn vm_config_from_command_line( gic_msi: match opt.gic_msi { cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto, cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its, - cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m, + cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m { + spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT, + }, }, }, ); From 8e7312c173b47961c341c9d5cb0b9ce1b0ec5fb7 Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 14:24:23 -0700 Subject: [PATCH 2/7] spi_layout: remove cfg_if, use #[cfg] attributes --- openvmm/openvmm_core/src/worker/dispatch.rs | 106 ++++++++++---------- 1 file changed, 52 insertions(+), 54 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 65557ed06b..7601fe164f 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -821,64 +821,62 @@ impl InitializedVm { None }; - cfg_if! { - if #[cfg(guest_arch = "aarch64")] { - use openvmm_defs::config::GicMsiConfig; - use vm_topology::processor::aarch64::GicItsInfo; - use vm_topology::processor::aarch64::GicMsiController; - use vm_topology::processor::aarch64::GicV2mInfo; - - // Resolve ITS vs v2m and determine v2m SPI count. 
- let arch_config = match &cfg.processor_topology.arch { - Some(ArchTopologyConfig::Aarch64(a)) => a, - _ => &Aarch64TopologyConfig::default(), - }; - let is_gicv2 = match &arch_config.gic_config { - Some(GicConfig::V2(_)) => true, - _ => !platform_info.supports_gic_v3, - }; - let v2m_spi_count = match &arch_config.gic_msi { - GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None, - GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT), - GicMsiConfig::Its => { - if is_gicv2 { - anyhow::bail!("ITS is incompatible with GICv2"); - } - if !platform_info.supports_its { - anyhow::bail!("ITS requested but the hypervisor does not support it"); - } - None + #[cfg(guest_arch = "aarch64")] + let processor_topology = { + use openvmm_defs::config::GicMsiConfig; + use vm_topology::processor::aarch64::GicItsInfo; + use vm_topology::processor::aarch64::GicMsiController; + use vm_topology::processor::aarch64::GicV2mInfo; + + // Resolve ITS vs v2m and determine v2m SPI count. + let arch_config = match &cfg.processor_topology.arch { + Some(ArchTopologyConfig::Aarch64(a)) => a, + _ => &Aarch64TopologyConfig::default(), + }; + let is_gicv2 = match &arch_config.gic_config { + Some(GicConfig::V2(_)) => true, + _ => !platform_info.supports_gic_v3, + }; + let v2m_spi_count = match &arch_config.gic_msi { + GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None, + GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT), + GicMsiConfig::Its => { + if is_gicv2 { + anyhow::bail!("ITS is incompatible with GICv2"); } - GicMsiConfig::V2m { spi_count } => Some(*spi_count), - }; - - // Resolve SPI layout — all SPI allocations in one deterministic pass. 
- let spi_layout = super::spi_layout::resolve_spi_layout( - &super::spi_layout::SpiLayoutInput { - v2m_spi_count, - }, - )?; + if !platform_info.supports_its { + anyhow::bail!("ITS requested but the hypervisor does not support it"); + } + None + } + GicMsiConfig::V2m { spi_count } => Some(*spi_count), + }; - // Build the GIC MSI controller from resolved SPIs. - let gic_msi = if let Some(count) = v2m_spi_count { - GicMsiController::V2m(GicV2mInfo { - frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, - spi_base: spi_layout.v2m_spi_base.expect("v2m base must be allocated when v2m_spi_count is Some"), - spi_count: count, - }) - } else { - GicMsiController::Its(GicItsInfo { - its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE, - }) - }; + // Resolve SPI layout — all SPI allocations in one deterministic pass. + let spi_layout = + super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput { + v2m_spi_count, + })?; - let processor_topology = - build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)?; + // Build the GIC MSI controller from resolved SPIs. + let gic_msi = if let Some(count) = v2m_spi_count { + GicMsiController::V2m(GicV2mInfo { + frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, + spi_base: spi_layout + .v2m_spi_base + .expect("v2m base must be allocated when v2m_spi_count is Some"), + spi_count: count, + }) } else { - let processor_topology = - build_x86_topology(&cfg.processor_topology)?; - } - } + GicMsiController::Its(GicItsInfo { + its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE, + }) + }; + + build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)? 
+ }; + #[cfg(not(guest_arch = "aarch64"))] + let processor_topology = build_x86_topology(&cfg.processor_topology)?; let proto = hypervisor .new_partition(virt::ProtoPartitionConfig { From 358071a96d3e3c9ae1469b4108f6beacaed401e5 Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 14:51:55 -0700 Subject: [PATCH 3/7] feedback --- openvmm/openvmm_core/src/worker/dispatch.rs | 111 +++++++++--------- openvmm/openvmm_core/src/worker/spi_layout.rs | 1 + openvmm/openvmm_defs/src/config.rs | 8 +- openvmm/openvmm_entry/src/lib.rs | 6 +- 4 files changed, 64 insertions(+), 62 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 7601fe164f..1e9981e57c 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -499,13 +499,25 @@ impl ExtractTopologyConfig for ProcessorTopology { } } +#[cfg(guest_arch = "aarch64")] +struct Aarch64TopologyResult { + processor_topology: ProcessorTopology, + #[expect(dead_code)] // consumed by SMMU device wiring + spi_layout: super::spi_layout::ResolvedSpiLayout, +} + #[cfg(guest_arch = "aarch64")] fn build_aarch64_topology( config: &ProcessorTopologyConfig, platform_info: &virt::PlatformInfo, - gic_msi: vm_topology::processor::aarch64::GicMsiController, -) -> anyhow::Result> { +) -> anyhow::Result { + use openvmm_defs::config::GicMsiConfig; use vm_topology::processor::aarch64::Aarch64PlatformConfig; + use vm_topology::processor::aarch64::GicItsInfo; + use vm_topology::processor::aarch64::GicMsiController; + use vm_topology::processor::aarch64::GicV2mInfo; + + const DEFAULT_GIC_V2M_SPI_COUNT: u32 = 64; let arch = match &config.arch { None => Default::default(), @@ -582,6 +594,43 @@ fn build_aarch64_topology( } }; + // Resolve ITS vs v2m and determine v2m SPI count. + let is_gicv2 = matches!(gic_version, GicVersion::V2 { .. 
}); + let v2m_spi_count = match &arch.gic_msi { + GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None, + GicMsiConfig::Auto => Some(DEFAULT_GIC_V2M_SPI_COUNT), + GicMsiConfig::Its => { + if is_gicv2 { + anyhow::bail!("ITS is incompatible with GICv2"); + } + if !platform_info.supports_its { + anyhow::bail!("ITS requested but the hypervisor does not support it"); + } + None + } + GicMsiConfig::V2m { spi_count } => Some(spi_count.unwrap_or(DEFAULT_GIC_V2M_SPI_COUNT)), + }; + + // Resolve SPI layout — all SPI allocations in one deterministic pass. + let spi_layout = super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput { + v2m_spi_count, + })?; + + // Build the GIC MSI controller from resolved SPIs. + let gic_msi = if let Some(count) = v2m_spi_count { + GicMsiController::V2m(GicV2mInfo { + frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, + spi_base: spi_layout + .v2m_spi_base + .expect("v2m base must be allocated when v2m_spi_count is Some"), + spi_count: count, + }) + } else { + GicMsiController::Its(GicItsInfo { + its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE, + }) + }; + let platform = Aarch64PlatformConfig { gic_distributor_base, gic_version, @@ -600,7 +649,10 @@ fn build_aarch64_topology( } else { builder.vps_per_socket(config.proc_count); } - Ok(builder.build(config.proc_count)?) + Ok(Aarch64TopologyResult { + processor_topology: builder.build(config.proc_count)?, + spi_layout, + }) } /// A VM that has been loaded and can be run. @@ -823,57 +875,8 @@ impl InitializedVm { #[cfg(guest_arch = "aarch64")] let processor_topology = { - use openvmm_defs::config::GicMsiConfig; - use vm_topology::processor::aarch64::GicItsInfo; - use vm_topology::processor::aarch64::GicMsiController; - use vm_topology::processor::aarch64::GicV2mInfo; - - // Resolve ITS vs v2m and determine v2m SPI count. 
- let arch_config = match &cfg.processor_topology.arch { - Some(ArchTopologyConfig::Aarch64(a)) => a, - _ => &Aarch64TopologyConfig::default(), - }; - let is_gicv2 = match &arch_config.gic_config { - Some(GicConfig::V2(_)) => true, - _ => !platform_info.supports_gic_v3, - }; - let v2m_spi_count = match &arch_config.gic_msi { - GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None, - GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT), - GicMsiConfig::Its => { - if is_gicv2 { - anyhow::bail!("ITS is incompatible with GICv2"); - } - if !platform_info.supports_its { - anyhow::bail!("ITS requested but the hypervisor does not support it"); - } - None - } - GicMsiConfig::V2m { spi_count } => Some(*spi_count), - }; - - // Resolve SPI layout — all SPI allocations in one deterministic pass. - let spi_layout = - super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput { - v2m_spi_count, - })?; - - // Build the GIC MSI controller from resolved SPIs. - let gic_msi = if let Some(count) = v2m_spi_count { - GicMsiController::V2m(GicV2mInfo { - frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, - spi_base: spi_layout - .v2m_spi_base - .expect("v2m base must be allocated when v2m_spi_count is Some"), - spi_count: count, - }) - } else { - GicMsiController::Its(GicItsInfo { - its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE, - }) - }; - - build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)? 
+ let result = build_aarch64_topology(&cfg.processor_topology, &platform_info)?; + result.processor_topology }; #[cfg(not(guest_arch = "aarch64"))] let processor_topology = build_x86_topology(&cfg.processor_topology)?; diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs index 97065590e8..7b468741e8 100644 --- a/openvmm/openvmm_core/src/worker/spi_layout.rs +++ b/openvmm/openvmm_core/src/worker/spi_layout.rs @@ -31,6 +31,7 @@ impl SpiAllocator { } /// Allocates a single SPI, returning its GIC INTID. + #[expect(dead_code)] // used when SMMU instances are configured fn alloc(&mut self, tag: &str) -> anyhow::Result { if self.cursor < self.range_start { anyhow::bail!("SPI exhausted allocating {tag}"); diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index c2144a8216..1ccf871307 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -80,9 +80,6 @@ pub const DEFAULT_GIC_V2M_MSI_FRAME_BASE: u64 = 0xEFFE_8000; /// Size of the v2m MSI frame (one 4KB page is the architectural minimum). pub const GIC_V2M_MSI_FRAME_SIZE: u64 = 0x1000; -/// Default number of SPIs reserved for PCIe MSIs when using GICv2m. -pub const DEFAULT_GIC_V2M_SPI_COUNT: u32 = 64; - /// Base address of the GICv3 ITS MMIO region. Must be 64 KiB aligned, /// below the v2m frame address, and not overlap other devices. /// The region extends from this base to base + GIC_ITS_SIZE (128 KiB). @@ -294,8 +291,9 @@ pub enum GicMsiConfig { Its, /// Force GICv2m for MSI delivery via SPIs. V2m { - /// Number of SPIs to reserve for PCIe MSIs. - spi_count: u32, + /// Number of SPIs to reserve for PCIe MSIs. Defaults to a + /// platform-specific value when `None`. 
+ spi_count: Option<u32>, }, } diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 542253cb3f..fbd1c888de 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -1285,9 +1285,9 @@ async fn vm_config_from_command_line( gic_msi: match opt.gic_msi { cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto, cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its, - cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m { - spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT, - }, + cli_args::GicMsiCli::V2m => { + openvmm_defs::config::GicMsiConfig::V2m { spi_count: None } + } }, }, ); From 082f0cb8f51f034b405f8e6d888a63f2b4a14aef Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 20:52:45 -0700 Subject: [PATCH 4/7] fix --- openvmm/openvmm_core/src/worker/dispatch.rs | 4 +++- openvmm/openvmm_core/src/worker/spi_layout.rs | 23 ++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 1e9981e57c..e82da2ae46 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -612,7 +612,9 @@ fn build_aarch64_topology( }; // Resolve SPI layout — all SPI allocations in one deterministic pass. 
+ let gic_nr_irqs = openvmm_defs::config::DEFAULT_GIC_NR_IRQS; let spi_layout = super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput { + gic_nr_irqs, v2m_spi_count, })?; @@ -637,7 +639,7 @@ fn build_aarch64_topology( gic_msi, pmu_gsiv, virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI, - gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS, + gic_nr_irqs, }; let mut builder = TopologyBuilder::new_aarch64(platform); diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs index 7b468741e8..cf93b62a0d 100644 --- a/openvmm/openvmm_core/src/worker/spi_layout.rs +++ b/openvmm/openvmm_core/src/worker/spi_layout.rs @@ -12,9 +12,10 @@ //! This is critical for hibernation — a resumed VM must get the same SPI //! layout as the original. //! -//! SPIs are allocated top-down from INTID 1019. This maximizes distance from -//! the guest-side vPCI MSI allocator (Hyper-V PCI driver in Linux), which -//! allocates bottom-up starting at INTID 64. +//! SPIs are allocated top-down from the highest SPI supported by the GIC +//! (determined by `gic_nr_irqs`). This maximizes distance from the guest-side +//! vPCI MSI allocator (Hyper-V PCI driver in Linux), which allocates bottom-up +//! starting at INTID 64. /// Top-down GIC SPI allocator. struct SpiAllocator { @@ -58,6 +59,9 @@ impl SpiAllocator { /// Inputs to the SPI layout resolver. pub(super) struct SpiLayoutInput { + /// Total number of GIC interrupt lines (INTIDs 0..gic_nr_irqs-1). + /// Determines the highest usable SPI. + pub gic_nr_irqs: u32, /// Number of SPIs to reserve for GICv2m MSI delivery. `None` when using /// ITS (no v2m block needed). pub v2m_spi_count: Option<u32>, @@ -72,11 +76,12 @@ pub(super) struct ResolvedSpiLayout { /// Resolves SPI assignments for all platform devices. /// /// All allocations happen here in a single top-down pass over the SPI range -/// `[64, 1019]`. 
The order of allocations determines the layout and must not -/// change across OpenVMM versions for a given config, or hibernation will -/// break. +/// `[64, gic_nr_irqs-1]`. The order of allocations determines the layout and +/// must not change across OpenVMM versions for a given config, or hibernation +/// will break. pub(super) fn resolve_spi_layout(input: &SpiLayoutInput) -> anyhow::Result { - let mut spi = SpiAllocator::new(64..=1019); + let max_intid = input.gic_nr_irqs.saturating_sub(1).min(1019); + let mut spi = SpiAllocator::new(64..=max_intid); // --- Allocation order (do not reorder!) --- @@ -96,16 +101,18 @@ mod tests { #[test] fn v2m_allocation() { let result = resolve_spi_layout(&SpiLayoutInput { + gic_nr_irqs: 992, v2m_spi_count: Some(64), }) .unwrap(); - assert_eq!(result.v2m_spi_base, Some(956)); + assert_eq!(result.v2m_spi_base, Some(928)); } #[test] fn its_skips_v2m() { let result = resolve_spi_layout(&SpiLayoutInput { + gic_nr_irqs: 992, v2m_spi_count: None, }) .unwrap(); From 03c51c118d1a4f042be9308a42e4e5711e83fef2 Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 22:24:55 -0700 Subject: [PATCH 5/7] fix --- openvmm/openvmm_core/src/worker/spi_layout.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs index cf93b62a0d..87ed7d1031 100644 --- a/openvmm/openvmm_core/src/worker/spi_layout.rs +++ b/openvmm/openvmm_core/src/worker/spi_layout.rs @@ -20,6 +20,8 @@ /// Top-down GIC SPI allocator. struct SpiAllocator { range_start: u32, + /// One past the last allocated INTID, or `range_end + 1` when nothing + /// has been allocated yet. cursor: u32, } @@ -27,33 +29,31 @@ impl SpiAllocator { fn new(range: std::ops::RangeInclusive) -> Self { Self { range_start: *range.start(), - cursor: *range.end(), + cursor: *range.end() + 1, } } /// Allocates a single SPI, returning its GIC INTID. 
#[expect(dead_code)] // used when SMMU instances are configured fn alloc(&mut self, tag: &str) -> anyhow::Result { - if self.cursor < self.range_start { + if self.cursor <= self.range_start { anyhow::bail!("SPI exhausted allocating {tag}"); } - let intid = self.cursor; self.cursor -= 1; - Ok(intid) + Ok(self.cursor) } /// Allocates a contiguous block of `count` SPIs, returning the lowest /// GIC INTID in the block. fn alloc_block(&mut self, tag: &str, count: u32) -> anyhow::Result { - let available = self.cursor.saturating_sub(self.range_start) + 1; - if count == 0 || count > available { + let available = self.cursor - self.range_start; + if count > available { anyhow::bail!( "SPI exhausted allocating {tag}: need {count}, only {available} remaining" ); } - let base = self.cursor - count + 1; - self.cursor = base - 1; - Ok(base) + self.cursor -= count; + Ok(self.cursor) } } From 3ab0cca3516d8b1ee2cd5d796639f5109786a41f Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 14:27:57 -0700 Subject: [PATCH 6/7] smmu --- Cargo.lock | 19 + Cargo.toml | 1 + openhcl/underhill_core/src/loader/mod.rs | 1 + openvmm/openvmm_core/Cargo.toml | 1 + openvmm/openvmm_core/src/worker/dispatch.rs | 365 ++- .../openvmm_core/src/worker/memory_layout.rs | 25 + openvmm/openvmm_core/src/worker/spi_layout.rs | 47 +- .../src/worker/vm_loaders/linux.rs | 49 + openvmm/openvmm_defs/src/config.rs | 14 + openvmm/openvmm_entry/src/cli_args.rs | 5 + openvmm/openvmm_entry/src/lib.rs | 8 + petri/src/vm/openvmm/modify.rs | 29 + vm/acpi_spec/src/iort.rs | 98 + vm/devices/iommu/smmu/Cargo.toml | 24 + vm/devices/iommu/smmu/src/emulator.rs | 2139 +++++++++++++++++ vm/devices/iommu/smmu/src/lib.rs | 19 + vm/devices/iommu/smmu/src/shared.rs | 1371 +++++++++++ vm/devices/iommu/smmu/src/spec/cd.rs | 445 ++++ vm/devices/iommu/smmu/src/spec/commands.rs | 299 +++ vm/devices/iommu/smmu/src/spec/events.rs | 265 ++ vm/devices/iommu/smmu/src/spec/mod.rs | 17 + vm/devices/iommu/smmu/src/spec/pt.rs | 396 
+++ vm/devices/iommu/smmu/src/spec/registers.rs | 708 ++++++ vm/devices/iommu/smmu/src/spec/ste.rs | 309 +++ vm/devices/iommu/smmu/src/translate.rs | 1046 ++++++++ vm/devices/pci/pci_core/src/bus_range.rs | 6 +- vmm_core/src/acpi_builder.rs | 384 ++- .../vmm_tests/tests/tests/multiarch/pcie.rs | 108 + 28 files changed, 8100 insertions(+), 98 deletions(-) create mode 100644 vm/devices/iommu/smmu/Cargo.toml create mode 100644 vm/devices/iommu/smmu/src/emulator.rs create mode 100644 vm/devices/iommu/smmu/src/lib.rs create mode 100644 vm/devices/iommu/smmu/src/shared.rs create mode 100644 vm/devices/iommu/smmu/src/spec/cd.rs create mode 100644 vm/devices/iommu/smmu/src/spec/commands.rs create mode 100644 vm/devices/iommu/smmu/src/spec/events.rs create mode 100644 vm/devices/iommu/smmu/src/spec/mod.rs create mode 100644 vm/devices/iommu/smmu/src/spec/pt.rs create mode 100644 vm/devices/iommu/smmu/src/spec/registers.rs create mode 100644 vm/devices/iommu/smmu/src/spec/ste.rs create mode 100644 vm/devices/iommu/smmu/src/translate.rs diff --git a/Cargo.lock b/Cargo.lock index 43ff006ebf..0fec691590 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5334,6 +5334,7 @@ dependencies = [ "scsi_core", "scsidisk", "serial_16550_resources", + "smmu", "sparse_mmap", "state_unit", "storvsp", @@ -7293,6 +7294,24 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "smmu" +version = "0.0.0" +dependencies = [ + "anyhow", + "bitfield-struct 0.11.0", + "chipset_device", + "guestmem", + "inspect", + "open_enum", + "pal_event", + "parking_lot", + "pci_core", + "tracelimit", + "vmcore", + "zerocopy", +] + [[package]] name = "smoltcp" version = "0.12.0" diff --git a/Cargo.toml b/Cargo.toml index 98dfb3c210..22bf7a59d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -290,6 +290,7 @@ pci_bus = { path = "vm/devices/pci/pci_bus" } pci_core = { path = 
"vm/devices/pci/pci_core" } pci_resources = { path = "vm/devices/pci/pci_resources" } pcie = { path = "vm/devices/pci/pcie" } +smmu = { path = "vm/devices/iommu/smmu" } vpci = { path = "vm/devices/pci/vpci" } vpci_client = { path = "vm/devices/pci/vpci_client" } vpci_protocol = { path = "vm/devices/pci/vpci_protocol" } diff --git a/openhcl/underhill_core/src/loader/mod.rs b/openhcl/underhill_core/src/loader/mod.rs index 9182fe02d9..f737834ecc 100644 --- a/openhcl/underhill_core/src/loader/mod.rs +++ b/openhcl/underhill_core/src/loader/mod.rs @@ -492,6 +492,7 @@ pub fn write_uefi_config( // Not used for MADT/SRAT generation; only matters for FADT. hypervisor_vendor_identity: 0, virt_timer_ppi: processor_topology.virt_timer_ppi(), + smmu: Vec::new(), }, }; diff --git a/openvmm/openvmm_core/Cargo.toml b/openvmm/openvmm_core/Cargo.toml index 2adf4460a3..2afde03497 100644 --- a/openvmm/openvmm_core/Cargo.toml +++ b/openvmm/openvmm_core/Cargo.toml @@ -70,6 +70,7 @@ pci_bus.workspace = true pci_core.workspace = true pci_resources.workspace = true pcie.workspace = true +smmu.workspace = true scsi_core.workspace = true scsidisk.workspace = true serial_16550_resources.workspace = true diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index e82da2ae46..a1a9044f16 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -407,11 +407,22 @@ pub(crate) struct InitializedVm { chipset_low_mmio: MemoryRange, chipset_high_mmio: MemoryRange, vtl2_chipset_mmio: MemoryRange, + resolved_smmu_resources: Vec, processor_topology: ProcessorTopology, igvm_file: Option, driver_source: VmTaskDriverSource, } +/// Resolved resources for a single SMMUv3 instance. +struct ResolvedSmmuResources { + /// MMIO base address (from the memory layout allocator). + base: u64, + /// GIC INTID for the event queue interrupt (from the SPI allocator). 
+ evtq_gsiv: u32, + /// GIC INTID for the global error interrupt (from the SPI allocator). + gerr_gsiv: u32, +} + trait ExtractTopologyConfig { fn to_config(&self) -> ProcessorTopologyConfig; } @@ -494,6 +505,7 @@ impl ExtractTopologyConfig for ProcessorTopology { None => PmuGsivConfig::Disabled, }, gic_msi: Default::default(), + smmu: Vec::new(), })), } } @@ -502,7 +514,6 @@ impl ExtractTopologyConfig for ProcessorTopology { #[cfg(guest_arch = "aarch64")] struct Aarch64TopologyResult { processor_topology: ProcessorTopology, - #[expect(dead_code)] // consumed by SMMU device wiring spi_layout: super::spi_layout::ResolvedSpiLayout, } @@ -616,6 +627,7 @@ fn build_aarch64_topology( let spi_layout = super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput { gic_nr_irqs, v2m_spi_count, + smmu_count: arch.smmu.len(), })?; // Build the GIC MSI controller from resolved SPIs. @@ -730,6 +742,12 @@ struct LoadedVmInner { automatic_guest_reset: bool, pcie_host_bridges: Vec, pcie_root_complexes: Vec>>, + /// SMMU configurations, one per instance. + #[cfg_attr(not(guest_arch = "aarch64"), expect(dead_code))] + smmu_configs: Vec, + /// Per-RC SMMU shared state, indexed parallel to `pcie_host_bridges`. + /// `None` for root complexes without an SMMU. + smmu_shared_states: Vec>>, pcie_hotplug_devices: Vec<( String, vmotherboard::DynamicDeviceUnit, @@ -876,9 +894,9 @@ impl InitializedVm { }; #[cfg(guest_arch = "aarch64")] - let processor_topology = { + let (processor_topology, spi_layout) = { let result = build_aarch64_topology(&cfg.processor_topology, &platform_info)?; - result.processor_topology + (result.processor_topology, result.spi_layout) }; #[cfg(not(guest_arch = "aarch64"))] let processor_topology = build_x86_topology(&cfg.processor_topology)?; @@ -938,12 +956,30 @@ impl InitializedVm { .filter(|(bus, _)| matches!(bus, VirtioBus::Mmio)) .count(); + // Count SMMU instances so the layout engine can allocate their MMIO. 
+ let smmu_count = { + #[cfg(guest_arch = "aarch64")] + { + match &cfg.processor_topology.arch { + Some(ArchTopologyConfig::Aarch64(Aarch64TopologyConfig { smmu, .. })) => { + smmu.len() + } + _ => 0, + } + } + #[cfg(not(guest_arch = "aarch64"))] + { + 0 + } + }; + let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(), layout: cfg.layout.clone(), pcie_root_complexes: &cfg.pcie_root_complexes, virtio_mmio_count, + smmu_count, vtl2_layout, physical_address_size, }) @@ -955,6 +991,23 @@ impl InitializedVm { let chipset_high_mmio = resolved_layout.chipset_high_mmio; let vtl2_chipset_mmio = resolved_layout.vtl2_chipset_mmio; + // Combine SMMU MMIO ranges with SPI layout. + cfg_if! { + if #[cfg(guest_arch = "aarch64")] { + let resolved_smmu_resources: Vec = resolved_layout.smmu_ranges + .iter() + .zip(&spi_layout.smmu) + .map(|(range, spis)| ResolvedSmmuResources { + base: range.start(), + evtq_gsiv: spis.evtq_gsiv, + gerr_gsiv: spis.gerr_gsiv, + }) + .collect(); + } else { + let resolved_smmu_resources: Vec = Vec::new(); + } + } + // Place the alias map at the end of the address space. 
Newer versions // of OpenHCL support receiving this offset via devicetree (especially // important on ARM64 where the physical address width used here is not @@ -1070,6 +1123,7 @@ impl InitializedVm { chipset_low_mmio, chipset_high_mmio, vtl2_chipset_mmio, + resolved_smmu_resources, processor_topology, igvm_file, driver_source, @@ -1101,6 +1155,7 @@ impl InitializedVm { chipset_low_mmio, chipset_high_mmio, vtl2_chipset_mmio, + resolved_smmu_resources, processor_topology, igvm_file, driver_source, @@ -1789,8 +1844,10 @@ impl InitializedVm { #[cfg(not(guest_arch = "aarch64"))] let use_its = false; - let (pcie_host_bridges, pcie_root_complexes) = { + let (pcie_host_bridges, pcie_rc_name_to_idx, pcie_root_complexes) = { let mut pcie_host_bridges = Vec::new(); + let mut pcie_rc_name_to_idx: std::collections::HashMap = + std::collections::HashMap::new(); let mut pcie_root_complexes = Vec::new(); for (rc, ranges) in cfg @@ -1859,13 +1916,14 @@ impl InitializedVm { high_mmio: ranges.high_mmio, }); + pcie_rc_name_to_idx.insert(rc.name.clone(), pcie_host_bridges.len() - 1); pcie_root_complexes.push(root_complex.clone()); let bus_id = vmotherboard::BusId::new(&rc.name); chipset_builder.register_weak_mutex_pcie_enumerator(bus_id, Box::new(root_complex)); } - (pcie_host_bridges, pcie_root_complexes) + (pcie_host_bridges, pcie_rc_name_to_idx, pcie_root_complexes) }; // Build a port-name→(segment, bus_range) map covering all ports in @@ -1984,44 +2042,118 @@ impl InitializedVm { Some(handle) }; + // Determine which SMMU instances to create. When active, PCIe devices + // on the covered root complexes get translating GuestMemory and + // SignalMsi wrappers that route DMA and MSI writes through the + // emulated SMMUv3. Each SMMU instance covers one root complex. + #[cfg(guest_arch = "aarch64")] + let smmu_instances: Vec = match &cfg + .processor_topology + .arch + { + Some(ArchTopologyConfig::Aarch64(Aarch64TopologyConfig { smmu, .. 
})) => smmu.clone(), + _ => Vec::new(), + }; + #[cfg(not(guest_arch = "aarch64"))] + let smmu_instances: Vec = Vec::new(); + + // When SMMU instances are configured, instantiate a device for each + // and build lookup maps: + // - rc_name → shared_state (for per-device wiring and hotplug) + // - port_name → shared_state (for the per-device loop below) + // + // SPI assignments come from the SPI allocator via smmu_gsivs. + // IRQ_LINE_SET vectors are GSIV - 32 (the GIC target offset). + let mut smmu_shared_states: Vec>> = + vec![None; pcie_host_bridges.len()]; + let mut smmu_configs = Vec::new(); + for (idx, inst) in smmu_instances.iter().enumerate() { + // Look up the RC by name to get its index. + let rc_pos = match pcie_rc_name_to_idx.get(&inst.rc_name) { + Some(&i) => i, + None => { + anyhow::bail!( + "SMMU instance references unknown root complex {:?}", + inst.rc_name + ); + } + }; + let smmu = &resolved_smmu_resources[idx]; + let evtq_irq_vector = smmu.evtq_gsiv - *vmm_core::emuplat::gic::SPI_RANGE.start(); + let gerror_irq_vector = smmu.gerr_gsiv - *vmm_core::emuplat::gic::SPI_RANGE.start(); + let device_name = format!("smmu:{}", inst.rc_name); + let smmu_config = smmu::SmmuConfig { + sid_bits: 16, + oas: 0b010, + }; + let smmu_device = + chipset_builder + .arc_mutex_device(device_name.as_str()) + .add(|services| { + let evtq_irq = services.new_line(IRQ_LINE_SET, "evtq", evtq_irq_vector); + let gerror_irq = + services.new_line(IRQ_LINE_SET, "gerror", gerror_irq_vector); + smmu::SmmuDevice::new( + smmu.base, + gm.clone(), + &smmu_config, + Some(evtq_irq), + Some(gerror_irq), + ) + })?; + smmu_shared_states[rc_pos] = Some(smmu_device.lock().shared_state().clone()); + smmu_configs.push(vmm_core::acpi_builder::AcpiSmmuConfig { + rc_index: pcie_host_bridges[rc_pos].index, + segment: pcie_host_bridges[rc_pos].segment, + base: smmu.base, + event_gsiv: smmu.evtq_gsiv, + gerr_gsiv: smmu.gerr_gsiv, + }); + } + + // Build a port-name → SMMU shared state map. 
Each downstream port of + // an SMMU-covered root complex inherits that SMMU. + let smmu_port_map: std::collections::HashMap, Arc> = + smmu_shared_states + .iter() + .zip(pcie_root_complexes.iter()) + .flat_map(|(shared, rc)| { + let shared = shared.clone(); + rc.lock() + .downstream_ports() + .into_iter() + .filter_map(move |dpi| shared.as_ref().map(|s| (dpi.name, s.clone()))) + }) + .collect(); + + // Track which RCs have SMMUs (for VFIO blocking). + let mut smmu_per_rc = vec![false; pcie_host_bridges.len()]; + for inst in &smmu_instances { + if let Some(&i) = pcie_rc_name_to_idx.get(&inst.rc_name) { + smmu_per_rc[i] = true; + } + } + + // Build port-name set for ports behind SMMUs. + let smmu_s1_ports: std::collections::HashSet> = smmu_per_rc + .iter() + .zip(pcie_root_complexes.iter()) + .flat_map(|(&has_smmu, rc)| { + rc.lock() + .downstream_ports() + .into_iter() + .filter_map(move |dpi| if has_smmu { Some(dpi.name) } else { None }) + }) + .collect(); + // Resolve PCIe devices concurrently. // // When ITS is active, the root complex's ITS-wrapped SignalMsi // and IrqFd are shared across all devices on that complex. Each // device's MsiConnection carries a default BDF derived from the // port's AssignedBusRange, which the MsiTarget resolves lazily - // at interrupt delivery time. - - // Build per-segment ITS-wrapped signal_msi and irqfd. Each root - // complex connection already has ITS wrapping for port MSIs; we - // share the same wrapped instances for child devices. 
- let its_signal_msi: std::collections::HashMap> = - if use_its { - let mut map = std::collections::HashMap::new(); - if let Some(s) = partition.as_signal_msi(Vtl::Vtl0) { - for hb in &pcie_host_bridges { - map.entry(hb.segment).or_insert_with(|| { - Arc::new(pcie::its::ItsSignalMsi::new(s.clone(), hb.segment)) as _ - }); - } - } - map - } else { - std::collections::HashMap::new() - }; - let its_irqfd: std::collections::HashMap> = if use_its { - let mut map = std::collections::HashMap::new(); - if let Some(fd) = partition.irqfd() { - for hb in &pcie_host_bridges { - map.entry(hb.segment).or_insert_with(|| { - Arc::new(pcie::its::ItsIrqFd::new(fd.clone(), hb.segment)) as _ - }); - } - } - map - } else { - std::collections::HashMap::new() - }; + // at interrupt delivery time. When SMMU is enabled, per-device + // wrappers translate IOVAs and MSI addresses through the emulated SMMU. try_join_all(cfg.pcie_devices.into_iter().map(|dev_cfg| { let chipset_builder = &chipset_builder; @@ -2031,8 +2163,8 @@ impl InitializedVm { let partition = &partition; let mapper = &mapper; let port_info = &port_info; - let its_signal_msi = &its_signal_msi; - let its_irqfd = &its_irqfd; + let smmu_port_map = &smmu_port_map; + let smmu_s1_ports = &smmu_s1_ports; async move { let port_name: Arc = dev_cfg.port_name.into(); let pi = port_info.get(&port_name).ok_or_else(|| { @@ -2042,14 +2174,39 @@ impl InitializedVm { ) })?; + // Block VFIO devices behind S1-capable SMMUs. The + // emulated SMMU's S1 page tables are not programmed + // into the host IOMMU, so VFIO DMA would bypass S1 + // translation. This will be lifted when iommufd + // nested translation support is available. + if dev_cfg.resource.id() == "vfio" && smmu_s1_ports.contains(&port_name) { + anyhow::bail!( + "VFIO device on port {:?} is behind an S1-capable SMMU, \ + but iommufd nested translation is not available. 
\ + Either place the device on a root complex without an SMMU, \ + configure the SMMU for S2-only mode (--smmu ,s2-only), \ + or enable iommufd nested translation.", + &*port_name, + ); + } + let msi_conn = pci_core::msi::MsiConnection::new(pi.bus_range.clone(), 0); + let (dev_gm, signal_msi, irqfd) = build_pcie_msi_context( + partition.as_ref(), + gm, + &pi.bus_range, + pi.segment, + use_its, + smmu_port_map.get(&port_name), + ); + vmm_core::device_builder::build_pcie_device( chipset_builder, port_name.clone(), driver_source, resolver, - gm, + &dev_gm, dev_cfg.resource, partition.clone().into_doorbell_registration(Vtl::Vtl0), Some(mapper), @@ -2057,23 +2214,9 @@ impl InitializedVm { ) .await?; - // When ITS is active, use the per-segment wrapped - // SignalMsi and IrqFd. The device's MsiConnection - // carries a default BDF from the port's bus range. - let signal_msi = if use_its { - its_signal_msi.get(&pi.segment).cloned() - } else { - partition.as_signal_msi(Vtl::Vtl0) - }; if let Some(target) = signal_msi { msi_conn.connect(target); } - - let irqfd = if use_its { - its_irqfd.get(&pi.segment).cloned() - } else { - partition.irqfd() - }; if let Some(fd) = irqfd { msi_conn.connect_irqfd(fd); } @@ -2567,6 +2710,8 @@ impl InitializedVm { pcie_host_bridges, pcie_root_complexes, pcie_hotplug_devices: Vec::new(), + smmu_configs, + smmu_shared_states, }, }; @@ -2614,6 +2759,7 @@ impl LoadedVmInner { 0 }, virt_timer_ppi: self.processor_topology.virt_timer_ppi(), + smmu: self.smmu_configs.clone(), }, }; @@ -2709,6 +2855,7 @@ impl LoadedVmInner { enable_serial, &self.processor_topology, &self.pcie_host_bridges, + &self.smmu_configs, build_acpi, )?; @@ -3098,7 +3245,25 @@ impl LoadedVm { .expect("port was just found above") .bus_range; - let msi_conn = pci_core::msi::MsiConnection::new(bus_range, 0); + let segment = self.inner.pcie_host_bridges[rc_idx].segment; + let msi_conn = pci_core::msi::MsiConnection::new(bus_range.clone(), 0); + + #[cfg(guest_arch = "aarch64")] + let 
use_its = matches!( + self.inner.processor_topology.gic_msi(), + vm_topology::processor::aarch64::GicMsiController::Its(_) + ); + #[cfg(not(guest_arch = "aarch64"))] + let use_its = false; + + let (dev_gm, signal_msi, irqfd) = build_pcie_msi_context( + self.inner.partition.as_ref(), + &self.inner.gm, + &bus_range, + segment, + use_its, + self.inner.smmu_shared_states[rc_idx].as_ref(), + ); let (unit, device) = self.inner.chipset_devices.add_dyn_device( &self.inner.driver_source, @@ -3112,7 +3277,7 @@ impl LoadedVm { msi_target: msi_conn.target(), register_mmio, driver_source: &self.inner.driver_source, - guest_memory: &self.inner.gm, + guest_memory: &dev_gm, doorbell_registration: self.inner.partition.clone().into_doorbell_registration(Vtl::Vtl0), shared_mem_mapper: None, }, @@ -3123,29 +3288,13 @@ impl LoadedVm { }, ).await?; - // Connect the MSI target and IrqFd, wrapping - // with ITS segment translation when needed. - #[cfg(guest_arch = "aarch64")] - let use_its = matches!( - self.inner.processor_topology.gic_msi(), - vm_topology::processor::aarch64::GicMsiController::Its(_) - ); - #[cfg(not(guest_arch = "aarch64"))] - let use_its = false; - let segment = self.inner.pcie_host_bridges[rc_idx].segment; - if let Some(s) = self.inner.partition.as_signal_msi(Vtl::Vtl0) { - if use_its { - msi_conn.connect(Arc::new(pcie::its::ItsSignalMsi::new(s, segment))); - } else { - msi_conn.connect(s); - } + // Connect the signal_msi and irqfd (possibly + // ITS-wrapped and/or SMMU-wrapped). + if let Some(target) = signal_msi { + msi_conn.connect(target); } - if let Some(fd) = self.inner.partition.irqfd() { - if use_its { - msi_conn.connect_irqfd(Arc::new(pcie::its::ItsIrqFd::new(fd, segment))); - } else { - msi_conn.connect_irqfd(fd); - } + if let Some(fd) = irqfd { + msi_conn.connect_irqfd(fd); } // Wrap the device as a GenericPciBusDevice for the port. @@ -3413,6 +3562,59 @@ impl LoadedVm { } } +/// Build the layered GuestMemory, SignalMsi, and IrqFd for a PCIe device. 
+/// +/// When ITS is active, wraps SignalMsi and IrqFd with segment translation. +/// When an SMMU covers the device, additionally wraps GuestMemory, SignalMsi, +/// and IrqFd with SMMU translation. +fn build_pcie_msi_context( + partition: &dyn HvlitePartition, + gm: &GuestMemory, + bus_range: &pci_core::bus_range::AssignedBusRange, + segment: u16, + use_its: bool, + smmu_shared: Option<&Arc>, +) -> ( + GuestMemory, + Option>, + Option>, +) { + // Base signal_msi: ITS-wrapped or plain. + let base_signal_msi = if use_its { + partition.as_signal_msi(Vtl::Vtl0).map(|s| { + Arc::new(pcie::its::ItsSignalMsi::new(s, segment)) as Arc + }) + } else { + partition.as_signal_msi(Vtl::Vtl0) + }; + + // Base irqfd: ITS-wrapped or plain. + let base_irqfd = if use_its { + partition.irqfd().map(|fd| { + Arc::new(pcie::its::ItsIrqFd::new(fd, segment)) as Arc + }) + } else { + partition.irqfd() + }; + + // When an SMMU covers this device, wrap GuestMemory and SignalMsi/IrqFd + // with SMMU translation. stream_id_base is 0 because each SMMU is 1:1 + // with its root complex — stream IDs are plain BDFs. + if let Some(shared) = smmu_shared { + let (translating_gm, smmu_msi) = base_signal_msi + .map(|inner_msi| { + let (gm, msi) = shared.create_device_context(bus_range.clone(), 0, gm, inner_msi); + (gm, Some(msi as Arc)) + }) + .unwrap_or_else(|| (gm.clone(), None)); + let irqfd = + base_irqfd.map(|fd| shared.create_irqfd(0, fd) as Arc); + (translating_gm, smmu_msi, irqfd) + } else { + (gm.clone(), base_signal_msi, base_irqfd) + } +} + #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))] fn add_devices_to_dsdt_x64( mem_layout: &MemoryLayout, @@ -3511,6 +3713,13 @@ fn add_devices_to_dsdt_arm64( // Always place under VMOD, not PCI0 — ARM64 doesn't use the x86 // PCI0 DSDT node. dsdt.add_vmbus(false, Some(VMBUS_INTID)); + } else if mem_layout.mmio().len() >= 2 { + // Even without HV enlightenments (e.g. 
KVM aarch64), the MMIO + // module is needed so the kernel knows the available MMIO address + // ranges for PCIe BAR allocation. + let low_mmio_gap = mem_layout.mmio()[0]; + let high_mmio_gap: MemoryRange = mem_layout.mmio()[1]; + dsdt.add_mmio_module(low_mmio_gap, high_mmio_gap); } if enable_serial { diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs index 3d42ca88d2..85e87c30ff 100644 --- a/openvmm/openvmm_core/src/worker/memory_layout.rs +++ b/openvmm/openvmm_core/src/worker/memory_layout.rs @@ -32,6 +32,9 @@ const PAGE_SIZE: u64 = 4096; const TWO_MB: u64 = 2 * 1024 * 1024; const GB: u64 = 1024 * 1024 * 1024; +/// SMMUv3 MMIO region size: two 64 KiB pages (page 0 + page 1). +const SMMU_SIZE: u64 = 0x2_0000; + /// PCIe ECAM: 32 devices * 8 functions * 4 KiB config space = 1 MB per bus. const PCIE_ECAM_BYTES_PER_BUS: u64 = 32 * 8 * 4096; @@ -61,6 +64,10 @@ pub(super) struct ResolvedMemoryLayout { /// VTL2-private chipset MMIO range, reported to VTL2 VMBus via the device /// tree. `EMPTY` when VTL2 is not configured or has no chipset MMIO. pub vtl2_chipset_mmio: MemoryRange, + /// Resolved MMIO ranges for SMMUv3 instances, one per configured SMMU. + /// Each range is `SMMU_SIZE` bytes. Empty when no SMMUs are configured. + #[cfg_attr(not(guest_arch = "aarch64"), expect(dead_code))] + pub smmu_ranges: Vec, } #[derive(Debug)] @@ -84,6 +91,9 @@ pub(super) struct MemoryLayoutInput<'a> { /// Number of virtio-mmio device slots to allocate in 32-bit MMIO space. /// A single contiguous region of `count * 4 KiB` is allocated. pub virtio_mmio_count: usize, + /// Number of SMMUv3 instances to allocate MMIO for. Each instance requires + /// `SMMU_SIZE` bytes (128 KiB), 128 KiB aligned, in 32-bit MMIO space. + pub smmu_count: usize, /// Optional IGVM VTL2 private-memory request. This is allocated after all /// VTL0-visible RAM and MMIO and is carried separately from ordinary RAM. 
pub vtl2_layout: Option, @@ -231,6 +241,19 @@ pub(super) fn resolve_memory_layout( ); } + // SMMUv3: allocate one 128 KiB region per instance. Placed below 4 GiB + // alongside other aarch64 system devices (GIC, ITS, PL011). + let mut smmu_ranges: Vec = vec![MemoryRange::EMPTY; input.smmu_count]; + for (idx, range) in smmu_ranges.iter_mut().enumerate() { + builder.request( + format!("smmu-{idx}"), + range, + SMMU_SIZE, + SMMU_SIZE, + Placement::Mmio32, + ); + } + // RAM request order is part of the NUMA compatibility contract: the first // request maps to vnode 0, the second to vnode 1, and so on. For GB-sized // nodes, use GB alignment so holes do not create sub-GB RAM chunks. For @@ -385,6 +408,7 @@ pub(super) fn resolve_memory_layout( chipset_low_mmio, chipset_high_mmio, vtl2_chipset_mmio, + smmu_ranges, }) } @@ -485,6 +509,7 @@ mod tests { layout: DEFAULT_LAYOUT, pcie_root_complexes: &[], virtio_mmio_count: 0, + smmu_count: 0, vtl2_layout, physical_address_size: 46, } diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs index 87ed7d1031..b8ddad5074 100644 --- a/openvmm/openvmm_core/src/worker/spi_layout.rs +++ b/openvmm/openvmm_core/src/worker/spi_layout.rs @@ -34,7 +34,6 @@ impl SpiAllocator { } /// Allocates a single SPI, returning its GIC INTID. - #[expect(dead_code)] // used when SMMU instances are configured fn alloc(&mut self, tag: &str) -> anyhow::Result { if self.cursor <= self.range_start { anyhow::bail!("SPI exhausted allocating {tag}"); @@ -65,12 +64,25 @@ pub(super) struct SpiLayoutInput { /// Number of SPIs to reserve for GICv2m MSI delivery. `None` when using /// ITS (no v2m block needed). pub v2m_spi_count: Option, + /// Number of SMMUv3 instances. Each instance gets two SPIs (event queue + /// and global error). + pub smmu_count: usize, } /// Resolved SPI assignments for all platform devices. pub(super) struct ResolvedSpiLayout { /// GICv2m SPI base INTID. `None` when using ITS. 
pub v2m_spi_base: Option, + /// Per-SMMU SPI assignments, one entry per instance. + pub smmu: Vec, +} + +/// Allocated SPI pair for a single SMMUv3 instance. +pub(super) struct SmmuSpiAllocation { + /// GIC INTID for the event queue interrupt. + pub evtq_gsiv: u32, + /// GIC INTID for the global error interrupt. + pub gerr_gsiv: u32, } /// Resolves SPI assignments for all platform devices. @@ -91,7 +103,16 @@ pub(super) fn resolve_spi_layout(input: &SpiLayoutInput) -> anyhow::Result, pcie_host_bridges: &[PcieHostBridge], + smmu_configs: &[vmm_core::acpi_builder::AcpiSmmuConfig], initrd_start: u64, initrd_end: u64, ) -> Result, fdt::builder::Error> { @@ -157,6 +158,8 @@ fn build_dt( const PL011_SERIAL0_IRQ: u32 = 1; const PL011_SERIAL1_BASE: u64 = 0xEFFEB000; const PL011_SERIAL1_IRQ: u32 = 2; + /// SMMUv3 MMIO region size: two 64 KiB pages (page 0 + page 1). + const SMMU_SIZE: u64 = 0x2_0000; let num_cpus = processor_topology.vps().len(); @@ -232,12 +235,16 @@ fn build_dt( let p_msi_controller = builder.add_string("msi-controller")?; let p_arm_msi_base_spi = builder.add_string("arm,msi-base-spi")?; let p_arm_msi_num_spis = builder.add_string("arm,msi-num-spis")?; + let p_iommu_cells = builder.add_string("#iommu-cells")?; + let p_iommu_map = builder.add_string("iommu-map")?; // Property handle values. const PHANDLE_GIC: u32 = 1; const PHANDLE_APB_PCLK: u32 = 2; const PHANDLE_V2M: u32 = 3; const PHANDLE_ITS: u32 = 4; + // SMMU phandles start at 5: SMMU instance N gets phandle 5 + N. + const PHANDLE_SMMU_BASE: u32 = 5; const GIC_SPI: u32 = 0; const GIC_PPI: u32 = 1; @@ -362,6 +369,39 @@ fn build_dt( GicMsiController::None => gic_node.end_node()?, }; + // SMMUv3 nodes (one per configured instance). + // Build a lookup from RC index → phandle for the iommu-map entries below. 
+ let mut smmu_phandles: Vec<(u32, u32)> = Vec::new(); + for (idx, smmu) in smmu_configs.iter().enumerate() { + let phandle = PHANDLE_SMMU_BASE + idx as u32; + smmu_phandles.push((smmu.rc_index, phandle)); + // SPI interrupts use GIC_SPI encoding. The GSIV is the full INTID + // (e.g., 35), and the DT `interrupts` property wants the SPI number + // (INTID - 32) for GIC_SPI type. + let evtq_spi = smmu.event_gsiv - 32; + let gerr_spi = smmu.gerr_gsiv - 32; + root_builder = root_builder + .start_node(format!("smmu@{:x}", smmu.base).as_str())? + .add_str(p_compatible, "arm,smmu-v3")? + .add_u64_array(p_reg, &[smmu.base, SMMU_SIZE])? + .add_u32_array( + p_interrupts, + &[ + GIC_SPI, + evtq_spi, + IRQ_TYPE_EDGE_RISING, + GIC_SPI, + gerr_spi, + IRQ_TYPE_EDGE_RISING, + ], + )? + .add_str_array(p_interrupt_names, &["eventq", "gerror"])? + .add_u32(p_iommu_cells, 1)? + .add_u32(p_phandle, phandle)? + .add_null(p_dma_coherent)? + .end_node()?; + } + // ARM64 Architectural Timer. // The DT `interrupts` property uses the PPI offset (INTID - 16). assert!((16..32).contains(&processor_topology.virt_timer_ppi())); @@ -457,6 +497,13 @@ fn build_dt( } GicMsiController::None => {} } + if let Some((_, phandle)) = smmu_phandles.iter().find(|(idx, _)| *idx == bridge.index) { + // iommu-map: <&smmu_phandle> + // Maps the full RID range (0..0x10000) for this root complex + // through its SMMU instance. stream_id_base is 0 because each + // SMMU is 1:1 with its RC — stream IDs are plain BDFs. 
+ node = node.add_u32_array(p_iommu_map, &[0, *phandle, 0, 0x10000])?; + } root_builder = node.end_node()?; } @@ -787,6 +834,7 @@ pub fn load_linux_arm64( enable_serial: bool, processor_topology: &ProcessorTopology, pcie_host_bridges: &[PcieHostBridge], + smmu_configs: &[vmm_core::acpi_builder::AcpiSmmuConfig], build_acpi: Option vmm_core::acpi_builder::BuiltAcpiTables>, ) -> Result, Error> { let mut loader = Loader::new(gm.clone(), cfg.mem_layout, hvdef::Vtl::Vtl0); @@ -847,6 +895,7 @@ pub fn load_linux_arm64( enable_serial, processor_topology, pcie_host_bridges, + smmu_configs, initrd_start, initrd_end, ) diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 1ccf871307..ad04398eab 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -297,11 +297,25 @@ pub enum GicMsiConfig { }, } +/// Per-instance SMMUv3 configuration for an aarch64 VM. +/// +/// Each instance covers one PCIe root complex, identified by name. +/// The SMMU's MMIO address is allocated dynamically by the memory layout +/// engine. +#[derive(Debug, Protobuf, Clone)] +pub struct SmmuInstanceConfig { + /// Name of the PCIe root complex this SMMU covers. + pub rc_name: String, +} + #[derive(Debug, Protobuf, Default, Clone)] pub struct Aarch64TopologyConfig { pub gic_config: Option, pub pmu_gsiv: PmuGsivConfig, pub gic_msi: GicMsiConfig, + /// SMMUv3 IOMMU instances. Each entry creates an SMMU for one PCIe root + /// complex (identified by name). Empty means no SMMU. + pub smmu: Vec, } /// GIC configuration for the virtual machine. diff --git a/openvmm/openvmm_entry/src/cli_args.rs b/openvmm/openvmm_entry/src/cli_args.rs index b9889e8ad5..d2b10d17c0 100644 --- a/openvmm/openvmm_entry/src/cli_args.rs +++ b/openvmm/openvmm_entry/src/cli_args.rs @@ -391,6 +391,11 @@ options: #[clap(long, default_value = "auto")] pub gic_msi: GicMsiCli, + /// enable SMMUv3 IOMMU for an aarch64 PCIe root complex (repeatable, e.g. 
--smmu rc0 --smmu rc1) + #[cfg(guest_arch = "aarch64")] + #[clap(long, value_name = "RC_NAME")] + pub smmu: Vec, + /// COM1 binding (console | stderr | listen=\ | file=\ (overwrites) | listen=tcp:\:\ | term[=\]\[,name=\\] | none) #[clap(long, value_name = "SERIAL")] pub com1: Option, diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index fbd1c888de..7e28c7bffb 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -1276,6 +1276,13 @@ async fn vm_config_from_command_line( vmbus_devices.push((openhcl_vtl, resource)); } + #[cfg(guest_arch = "aarch64")] + let smmu_instances: Vec = opt + .smmu + .iter() + .map(|s| openvmm_defs::config::SmmuInstanceConfig { rc_name: s.clone() }) + .collect(); + #[cfg(guest_arch = "aarch64")] let topology_arch = openvmm_defs::config::ArchTopologyConfig::Aarch64( openvmm_defs::config::Aarch64TopologyConfig { @@ -1289,6 +1296,7 @@ async fn vm_config_from_command_line( openvmm_defs::config::GicMsiConfig::V2m { spi_count: None } } }, + smmu: smmu_instances, }, ); #[cfg(guest_arch = "x86_64")] diff --git a/petri/src/vm/openvmm/modify.rs b/petri/src/vm/openvmm/modify.rs index dcaadb322c..f7ea3077f5 100644 --- a/petri/src/vm/openvmm/modify.rs +++ b/petri/src/vm/openvmm/modify.rs @@ -29,6 +29,7 @@ use openvmm_defs::config::PcieMmioRangeConfig; use openvmm_defs::config::PcieRootComplexConfig; use openvmm_defs::config::PcieRootPortConfig; use openvmm_defs::config::PcieSwitchConfig; +use openvmm_defs::config::SmmuInstanceConfig; use openvmm_defs::config::VpciDeviceConfig; use openvmm_defs::config::Vtl2BaseAddressType; use vm_resource::IntoResource; @@ -301,6 +302,34 @@ impl PetriVmConfigOpenVmm { self } + /// Enable SMMUv3 IOMMU on the specified root complexes (aarch64 only). + /// + /// Each name must match a root complex added via + /// [`with_pcie_root_topology`](Self::with_pcie_root_topology). 
The SMMU + /// provides stage 1 IOVA translation for devices behind those root + /// complexes. + pub fn with_smmu(mut self, rc_names: &[&str]) -> Self { + let arch = self + .config + .processor_topology + .arch + .as_mut() + .expect("arch topology not set"); + + match arch { + openvmm_defs::config::ArchTopologyConfig::Aarch64(aarch64) => { + aarch64.smmu = rc_names + .iter() + .map(|name| SmmuInstanceConfig { + rc_name: name.to_string(), + }) + .collect(); + } + _ => panic!("SMMU is only supported on aarch64"), + } + self + } + /// This is intended for special one-off use cases. As soon as something /// is needed in multiple tests we should consider making it a supported /// pattern. diff --git a/vm/acpi_spec/src/iort.rs b/vm/acpi_spec/src/iort.rs index d8000f1bec..b0cd71df82 100644 --- a/vm/acpi_spec/src/iort.rs +++ b/vm/acpi_spec/src/iort.rs @@ -18,9 +18,11 @@ pub const IORT_NODE_OFFSET: u32 = size_of::() as u32 + size_of::< pub const IORT_NODE_TYPE_ITS_GROUP: u8 = 0x00; pub const IORT_NODE_TYPE_PCI_ROOT_COMPLEX: u8 = 0x02; +pub const IORT_NODE_TYPE_SMMUV3: u8 = 0x04; pub const IORT_PCI_ROOT_COMPLEX_REVISION: u8 = 3; pub const IORT_ITS_GROUP_REVISION: u8 = 1; +pub const IORT_SMMUV3_REVISION: u8 = 5; pub const IORT_NODE_COHERENT: u32 = 0x00000001; pub const IORT_MEMORY_ACCESS_COHERENCY: u8 = 1 << 0; @@ -203,3 +205,99 @@ impl IortItsGroup { } const_assert_eq!(size_of::(), 20); + +/// SMMUv3 node flags. +pub const IORT_SMMUV3_FLAG_COHACC: u32 = 1 << 0; +/// `device_id_mapping_index` is valid (IORT rev E.e / node rev 5+). +pub const IORT_SMMUV3_FLAG_DEVICEID_VALID: u32 = 1 << 4; + +/// SMMUv3 model: generic SMMU-v3. +pub const IORT_SMMUV3_MODEL_GENERIC: u32 = 0; + +/// SMMUv3 node per IORT spec DEN0049E §E.4. 
+#[repr(C, packed)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct IortSmmuV3 { + pub header: IortNodeHeader, + pub base_address: u64_ne, + pub flags: u32_ne, + pub reserved: u32_ne, + pub vatos_address: u64_ne, + pub model: u32_ne, + pub event_gsiv: u32_ne, + pub pri_gsiv: u32_ne, + pub gerr_gsiv: u32_ne, + pub sync_gsiv: u32_ne, + pub proximity_domain: u32_ne, + pub device_id_mapping_index: u32_ne, +} + +impl IortSmmuV3 { + /// Create an SMMUv3 node with COHACC set, wired SPI interrupts (GSIVs), + /// and the specified number of ID mappings. The `length` field in the + /// header includes space for `mapping_count` trailing `IortIdMapping` + /// entries, which must be appended separately. + pub fn new( + identifier: u32, + base_address: u64, + mapping_count: u32, + event_gsiv: u32, + gerr_gsiv: u32, + ) -> Self { + Self::new_with_device_id_mapping( + identifier, + base_address, + mapping_count, + event_gsiv, + gerr_gsiv, + 0, + ) + } + + /// Create an SMMUv3 node with an explicit `device_id_mapping_index`. + /// + /// `device_id_mapping_index` selects which ID mapping entry Linux uses + /// for the SMMU's own MSI domain lookup. That mapping must have the + /// `IORT_ID_SINGLE_MAPPING` flag set. The `DEVICEID_VALID` flag is + /// added to the node flags whenever `mapping_count` is nonzero.
+ pub fn new_with_device_id_mapping( + identifier: u32, + base_address: u64, + mapping_count: u32, + event_gsiv: u32, + gerr_gsiv: u32, + device_id_mapping_index: u32, + ) -> Self { + let mut header = IortNodeHeader::new::( + IORT_NODE_TYPE_SMMUV3, + IORT_SMMUV3_REVISION, + identifier, + mapping_count, + ); + let total = + size_of::() as u16 + (mapping_count as u16) * size_of::() as u16; + header.length = total.into(); + Self { + header, + base_address: base_address.into(), + flags: (IORT_SMMUV3_FLAG_COHACC + | if mapping_count > 0 { + IORT_SMMUV3_FLAG_DEVICEID_VALID + } else { + 0 + }) + .into(), + reserved: 0.into(), + vatos_address: 0.into(), + model: IORT_SMMUV3_MODEL_GENERIC.into(), + event_gsiv: event_gsiv.into(), + pri_gsiv: 0.into(), + gerr_gsiv: gerr_gsiv.into(), + sync_gsiv: 0.into(), + proximity_domain: 0.into(), + device_id_mapping_index: device_id_mapping_index.into(), + } + } +} + +const_assert_eq!(size_of::(), 68); diff --git a/vm/devices/iommu/smmu/Cargo.toml b/vm/devices/iommu/smmu/Cargo.toml new file mode 100644 index 0000000000..c6bc7868cd --- /dev/null +++ b/vm/devices/iommu/smmu/Cargo.toml @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +[package] +name = "smmu" +edition.workspace = true +rust-version.workspace = true + +[dependencies] +anyhow.workspace = true +bitfield-struct.workspace = true +chipset_device.workspace = true +guestmem.workspace = true +inspect.workspace = true +open_enum.workspace = true +pal_event.workspace = true +parking_lot.workspace = true +pci_core.workspace = true +tracelimit.workspace = true +vmcore.workspace = true +zerocopy.workspace = true + +[lints] +workspace = true diff --git a/vm/devices/iommu/smmu/src/emulator.rs b/vm/devices/iommu/smmu/src/emulator.rs new file mode 100644 index 0000000000..65d5a6fd2e --- /dev/null +++ b/vm/devices/iommu/smmu/src/emulator.rs @@ -0,0 +1,2139 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! 
SMMUv3 device emulator — register file and MMIO dispatch. + +use crate::shared::SmmuSharedState; +use crate::spec::commands::CmdEntry; +use crate::spec::commands::CmdOpcode; +use crate::spec::commands::CmdSync; +use crate::spec::commands::SyncCs; +use crate::spec::registers; +use chipset_device::ChipsetDevice; +use chipset_device::io::IoError; +use chipset_device::io::IoResult; +use chipset_device::mmio::MmioIntercept; +use guestmem::GuestMemory; +use inspect::Inspect; +use inspect::InspectMut; +use std::ops::RangeInclusive; +use std::sync::Arc; +use vmcore::device_state::ChangeDeviceState; +use vmcore::line_interrupt::LineInterrupt; +use vmcore::save_restore::RestoreError; +use vmcore::save_restore::SaveError; +use vmcore::save_restore::SaveRestore; +use vmcore::save_restore::SavedStateNotSupported; + +/// SMMUv3 device configuration. +#[derive(Debug, Clone)] +pub struct SmmuConfig { + /// Number of StreamID bits (max 32, typically 16). + pub sid_bits: u8, + /// Output address size encoding (IDR5.OAS value). + pub oas: u8, +} + +impl Default for SmmuConfig { + fn default() -> Self { + Self { + sid_bits: 16, + oas: 0b010, // 40-bit OAS + } + } +} + +/// Per-queue MSI configuration registers. +#[derive(Debug, Default, Inspect)] +struct MsiConfig { + /// MSI address (64-bit, from IRQ_CFG0). + addr: u64, + /// MSI data payload (32-bit, from IRQ_CFG1). + data: u32, + /// MSI attributes (32-bit, from IRQ_CFG2). + attr: u32, +} + +/// SMMUv3 device emulator. +/// +/// Implements MMIO register access for the SMMUv3 register file. The device +/// responds to reads/writes across a 128KB region (page 0 + page 1). +#[derive(InspectMut)] +pub struct SmmuDevice { + // Static configuration + #[inspect(skip)] + mmio_region: (&'static str, RangeInclusive), + #[inspect(skip)] + mmio_base: u64, + + // Guest memory for reading queues and page tables. + #[inspect(skip)] + guest_memory: GuestMemory, + + // Shared state for per-device translation wrappers. 
+ #[inspect(skip)] + shared_state: Arc, + + // Identification registers (read-only, set at construction). + idr0: registers::Idr0, + idr1: registers::Idr1, + #[inspect(hex)] + idr2: u32, + #[inspect(hex)] + idr3: u32, + #[inspect(hex)] + idr4: u32, + idr5: registers::Idr5, + #[inspect(hex)] + iidr: u32, + #[inspect(hex)] + aidr: u32, + + // Control registers. + cr0: registers::Cr0, + cr0ack: registers::Cr0, + cr1: registers::Cr1, + cr2: registers::Cr2, + gbpa: registers::Gbpa, + + // Interrupt control. + irq_ctrl: registers::IrqCtrl, + irq_ctrlack: registers::IrqCtrl, + + // Stream table base. + #[inspect(hex)] + strtab_base: u64, + strtab_base_cfg: registers::StrtabBaseCfg, + + // Command queue. + #[inspect(hex)] + cmdq_base: u64, + cmdq_prod: u32, + cmdq_cons: registers::CmdqCons, + + // Event queue base register (raw value for MMIO read/write). + // EVTQ producer/consumer state lives in SmmuSharedState. + #[inspect(hex)] + evtq_base: u64, + + // MSI configuration (stored for guest register access, not used for + // interrupt delivery since IDR0.MSI=0). + gerror_msi: MsiConfig, + evtq_msi: MsiConfig, + cmdq_msi: MsiConfig, +} + +impl SmmuDevice { + /// Creates a new SMMUv3 device. + /// + /// `mmio_base` is the physical address for the 128KB MMIO region. + /// `guest_memory` is used for reading command/event queues and page tables. + /// `evtq_irq` and `gerror_irq` are wired SPI interrupt lines for event + /// queue and global error signaling. 
+ pub fn new( + mmio_base: u64, + guest_memory: GuestMemory, + config: &SmmuConfig, + evtq_irq: Option, + gerror_irq: Option, + ) -> Self { + let idr0 = registers::Idr0::new() + .with_s1p(true) + .with_s2p(false) + .with_ttf(0b10) // AArch64 only + .with_cohacc(true) + .with_asid16(true) + .with_msi(false) + .with_ttendian(0b10) // Little-endian + .with_st_level(0b00); // Linear stream table only + + let idr1 = registers::Idr1::new() + .with_sidsize(config.sid_bits) + .with_ssidsize(0) + .with_cmdqs(8) // 256 entries max + .with_eventqs(8) // 256 entries max + .with_attr_types_ovr(true) + .with_tables_preset(false) + .with_queues_preset(false) + .with_rel(false); + + let idr5 = registers::Idr5::new() + .with_oas(config.oas) + .with_gran4k(true) + .with_gran16k(false) + .with_gran64k(false); + + // GBPA defaults to ABORT=1 (abort all transactions when SMMU is disabled). + let gbpa = registers::Gbpa::new().with_abort(true); + + let shared_state = SmmuSharedState::new(guest_memory.clone(), evtq_irq, gerror_irq); + + SmmuDevice { + mmio_region: ( + "smmu", + mmio_base..=mmio_base + registers::MMIO_REGION_SIZE - 1, + ), + mmio_base, + guest_memory, + shared_state, + + idr0, + idr1, + idr2: 0, + idr3: 0, + idr4: 0, + idr5, + iidr: 0, + aidr: 0x03, // SMMUv3.3 + + cr0: registers::Cr0::new(), + cr0ack: registers::Cr0::new(), + cr1: registers::Cr1::new(), + cr2: registers::Cr2::new(), + gbpa, + + irq_ctrl: registers::IrqCtrl::new(), + irq_ctrlack: registers::IrqCtrl::new(), + + strtab_base: 0, + strtab_base_cfg: registers::StrtabBaseCfg::new(), + + cmdq_base: 0, + cmdq_prod: 0, + cmdq_cons: registers::CmdqCons::new(), + + evtq_base: 0, + + gerror_msi: MsiConfig::default(), + evtq_msi: MsiConfig::default(), + cmdq_msi: MsiConfig::default(), + } + } + + /// Returns the shared state for creating per-device translation wrappers. + pub fn shared_state(&self) -> &Arc { + &self.shared_state + } + + /// Handles a 32-bit MMIO read at the given offset from the device base. 
+ fn read_reg32(&self, offset: u32) -> u32 { + match offset as u16 { + registers::IDR0 => self.idr0.into(), + registers::IDR1 => self.idr1.into(), + registers::IDR2 => self.idr2, + registers::IDR3 => self.idr3, + registers::IDR4 => self.idr4, + registers::IDR5 => self.idr5.into(), + registers::IIDR => self.iidr, + registers::AIDR => self.aidr, + + registers::CR0 => self.cr0.into(), + registers::CR0ACK => self.cr0ack.into(), + registers::CR1 => self.cr1.into(), + registers::CR2 => self.cr2.into(), + registers::STATUSR => 0, + registers::GBPA => self.gbpa.into(), + registers::AGBPA => 0, + + registers::IRQ_CTRL => self.irq_ctrl.into(), + registers::IRQ_CTRLACK => self.irq_ctrlack.into(), + + registers::GERROR => self.shared_state.read_gerror().into(), + registers::GERRORN => self.shared_state.read_gerrorn().into(), + + registers::STRTAB_BASE_CFG => self.strtab_base_cfg.into(), + + registers::CMDQ_PROD => self.cmdq_prod, + registers::CMDQ_CONS => self.cmdq_cons.into(), + + // Page 0 read of GERROR_IRQ_CFG1 + registers::GERROR_IRQ_CFG1 => self.gerror_msi.data, + registers::GERROR_IRQ_CFG2 => self.gerror_msi.attr, + + // Page 0 read of EVENTQ_IRQ_CFG1 + registers::EVENTQ_IRQ_CFG1 => self.evtq_msi.data, + registers::EVENTQ_IRQ_CFG2 => self.evtq_msi.attr, + + _ => { + tracelimit::warn_ratelimited!(offset, "smmu: unhandled 32-bit MMIO read"); + 0 + } + } + } + + /// Handles a 64-bit MMIO read at the given offset from the device base. + fn read_reg64(&self, offset: u32) -> u64 { + match offset as u16 { + registers::STRTAB_BASE => self.strtab_base, + registers::CMDQ_BASE => self.cmdq_base, + registers::EVENTQ_BASE => self.evtq_base, + registers::GERROR_IRQ_CFG0 => self.gerror_msi.addr, + registers::EVENTQ_IRQ_CFG0 => self.evtq_msi.addr, + _ => { + tracelimit::warn_ratelimited!(offset, "smmu: unhandled 64-bit MMIO read"); + 0 + } + } + } + + /// Handles a 32-bit MMIO write at the given offset. 
+ fn write_reg32(&mut self, offset: u32, value: u32) { + match offset as u16 { + // Read-only registers: ignore writes. + registers::IDR0 + | registers::IDR1 + | registers::IDR2 + | registers::IDR3 + | registers::IDR4 + | registers::IDR5 + | registers::IIDR + | registers::AIDR + | registers::CR0ACK + | registers::STATUSR + | registers::IRQ_CTRLACK => {} + + registers::CR0 => { + self.cr0 = registers::Cr0::from(value); + // Immediate acknowledge — no async enable sequence. + self.cr0ack = self.cr0; + // Sync enable state to shared state for per-device wrappers. + self.shared_state.set_enabled(self.cr0.smmuen()); + self.shared_state.set_evtq_enabled(self.cr0.eventqen()); + } + registers::CR1 => { + self.cr1 = registers::Cr1::from(value); + } + registers::CR2 => { + self.cr2 = registers::Cr2::from(value); + } + registers::GBPA => { + // Clear the UPDATE bit on write (the SMMU "completes" the + // update immediately). + let mut gbpa = registers::Gbpa::from(value); + gbpa.set_update(false); + self.gbpa = gbpa; + } + registers::IRQ_CTRL => { + self.irq_ctrl = registers::IrqCtrl::from(value); + // Immediate acknowledge. + self.irq_ctrlack = self.irq_ctrl; + self.shared_state + .set_irq_ctrl(self.irq_ctrl.eventq_irqen(), self.irq_ctrl.gerror_irqen()); + } + registers::GERRORN => { + self.shared_state.write_gerrorn(value); + } + + registers::STRTAB_BASE_CFG => { + self.strtab_base_cfg = registers::StrtabBaseCfg::from(value); + self.sync_strtab_to_shared(); + } + + registers::CMDQ_PROD => { + self.cmdq_prod = value; + self.process_cmdq(); + } + registers::CMDQ_CONS => { + // CMDQ_CONS is writable by the SMMU only (for error reporting). + // Guest writes are ignored per spec. 
+ } + + registers::GERROR_IRQ_CFG1 => self.gerror_msi.data = value, + registers::GERROR_IRQ_CFG2 => self.gerror_msi.attr = value, + + registers::EVENTQ_IRQ_CFG1 => self.evtq_msi.data = value, + registers::EVENTQ_IRQ_CFG2 => self.evtq_msi.attr = value, + + _ => { + tracelimit::warn_ratelimited!(offset, value, "smmu: unhandled 32-bit MMIO write"); + } + } + } + + /// Handles a 64-bit MMIO write at the given offset. + fn write_reg64(&mut self, offset: u32, value: u64) { + match offset as u16 { + registers::STRTAB_BASE => { + self.strtab_base = value; + self.sync_strtab_to_shared(); + } + registers::CMDQ_BASE => { + self.cmdq_base = value; + } + registers::EVENTQ_BASE => { + self.evtq_base = value; + self.sync_evtq_to_shared(); + } + registers::GERROR_IRQ_CFG0 => self.gerror_msi.addr = value, + registers::EVENTQ_IRQ_CFG0 => self.evtq_msi.addr = value, + + _ => { + tracelimit::warn_ratelimited!(offset, value, "smmu: unhandled 64-bit MMIO write"); + } + } + } + + /// Handles page 1 register reads (offset >= 0x10000). + fn read_page1_reg32(&self, offset: u32) -> u32 { + match offset { + registers::EVENTQ_PROD_PAGE1 => self.shared_state.evtq_prod(), + registers::EVENTQ_CONS_PAGE1 => self.shared_state.evtq_cons(), + registers::CMDQ_IRQ_CFG1_PAGE1 => self.cmdq_msi.data, + registers::CMDQ_IRQ_CFG2_PAGE1 => self.cmdq_msi.attr, + _ => { + tracelimit::warn_ratelimited!(offset, "smmu: unhandled page 1 32-bit MMIO read"); + 0 + } + } + } + + /// Handles page 1 register reads (64-bit, offset >= 0x10000). + fn read_page1_reg64(&self, offset: u32) -> u64 { + match offset { + registers::CMDQ_IRQ_CFG0_PAGE1 => self.cmdq_msi.addr, + _ => { + tracelimit::warn_ratelimited!(offset, "smmu: unhandled page 1 64-bit MMIO read"); + 0 + } + } + } + + /// Handles page 1 register writes (offset >= 0x10000). 
+ fn write_page1_reg32(&mut self, offset: u32, value: u32) { + match offset { + registers::EVENTQ_PROD_PAGE1 => { + // EVTQ_PROD on page 1 is writable by the SMMU only (for + // writing events). Guest writes are ignored. + } + registers::EVENTQ_CONS_PAGE1 => { + self.shared_state.set_evtq_cons(value); + } + registers::CMDQ_IRQ_CFG1_PAGE1 => self.cmdq_msi.data = value, + registers::CMDQ_IRQ_CFG2_PAGE1 => self.cmdq_msi.attr = value, + _ => { + tracelimit::warn_ratelimited!( + offset, + value, + "smmu: unhandled page 1 32-bit MMIO write" + ); + } + } + } + + /// Handles page 1 register writes (64-bit, offset >= 0x10000). + fn write_page1_reg64(&mut self, offset: u32, value: u64) { + match offset { + registers::CMDQ_IRQ_CFG0_PAGE1 => self.cmdq_msi.addr = value, + _ => { + tracelimit::warn_ratelimited!( + offset, + value, + "smmu: unhandled page 1 64-bit MMIO write" + ); + } + } + } + + // ========================================================================= + // Shared State Synchronization + // ========================================================================= + + /// Sync the stream table base address and size to the shared state. + fn sync_strtab_to_shared(&self) { + let base = registers::StrtabBase::from(self.strtab_base).addr(); + let log2size = self.strtab_base_cfg.log2size(); + self.shared_state.set_strtab(base, log2size); + } + + /// Sync the event queue base address and size to the shared state. 
+ fn sync_evtq_to_shared(&self) { + let base_addr = registers::QueueBase::from(self.evtq_base).addr(); + let raw_log2size = registers::QueueBase::from(self.evtq_base).log2size(); + let log2size = raw_log2size.min(self.idr1.eventqs()); + self.shared_state.set_evtq_config(base_addr, log2size); + } + + // ========================================================================= + // Command Queue Processing + // ========================================================================= + + /// Returns the log2 size of the command queue from CMDQ_BASE, + /// clamped to the maximum advertised in IDR1.CMDQS. + fn cmdq_log2size(&self) -> u8 { + let raw = registers::QueueBase::from(self.cmdq_base).log2size(); + let max = self.idr1.cmdqs(); + raw.min(max) + } + + /// Returns the base GPA of the command queue from CMDQ_BASE. + fn cmdq_base_addr(&self) -> u64 { + registers::QueueBase::from(self.cmdq_base).addr() + } + + /// Checks if CMDQ processing is enabled. Only CR0.CMDQEN is consulted; + /// CR0.SMMUEN does not need to be set. + fn cmdq_enabled(&self) -> bool { + self.cr0.cmdqen() + } + + /// Returns true if the CMDQ has a pending (active, unacknowledged) error. + fn cmdq_has_error(&self) -> bool { + self.shared_state.cmdq_err_active() + } + + /// Process all pending commands in the command queue. + /// + /// Called when the guest writes CMDQ_PROD. Consumes commands from + /// CMDQ_CONS up to CMDQ_PROD, dispatching each by opcode. + fn process_cmdq(&mut self) { + if !self.cmdq_enabled() { + return; + } + + // Don't process if there's an outstanding CMDQ error. + if self.cmdq_has_error() { + return; + } + + let log2size = self.cmdq_log2size() as u32; + let max_entries = 1u32 << log2size; + // The wrap mask includes the wrap bit: (2 * max_entries - 1). + let index_mask = (max_entries << 1) - 1; + let base_addr = self.cmdq_base_addr(); + + // Extract the raw cons value (bits [19:0] include the wrap bit).
+ let mut cons = self.cmdq_cons.rd(); + let prod = self.cmdq_prod & index_mask; + + // Limit iterations to prevent infinite loops on malformed state. + let mut iterations = 0u32; + + while cons != prod { + if iterations >= max_entries { + // Safety valve: should never happen with well-behaved software. + tracelimit::warn_ratelimited!("smmu: CMDQ processing exceeded max iterations"); + break; + } + iterations += 1; + + // Compute the entry address: index within the queue (without wrap bit). + let index = cons & (max_entries - 1); + let entry_addr = base_addr + (index as u64) * (size_of::() as u64); + + // Read the 16-byte command entry from guest memory. + let entry = match self.guest_memory.read_plain::(entry_addr) { + Ok(entry) => entry, + Err(e) => { + tracelimit::warn_ratelimited!( + error = &e as &dyn std::error::Error, + entry_addr, + "smmu: failed to read CMDQ entry from guest memory" + ); + // Set CMDQ error: abort. + self.set_cmdq_error(registers::CmdqError::CERROR_ABT); + break; + } + }; + + match entry.opcode() { + // Configuration invalidation commands — no-op (no cache yet). + CmdOpcode::PREFETCH_CFG + | CmdOpcode::CFGI_STE + | CmdOpcode::CFGI_STE_RANGE + | CmdOpcode::CFGI_CD + | CmdOpcode::CFGI_CD_ALL => {} + + // TLB invalidation commands — no-op (no TLB yet). + CmdOpcode::TLBI_NH_ALL + | CmdOpcode::TLBI_NH_ASID + | CmdOpcode::TLBI_NH_VA + | CmdOpcode::TLBI_NH_VAA + | CmdOpcode::TLBI_S12_VMALL + | CmdOpcode::TLBI_NSNH_ALL => {} + + // Synchronization command. + CmdOpcode::CMD_SYNC => { + self.handle_cmd_sync(&entry); + } + + // Unknown opcode — set CMDQ error. + opcode => { + tracelimit::warn_ratelimited!(?opcode, "smmu: unknown CMDQ opcode"); + self.set_cmdq_error(registers::CmdqError::CERROR_ILL); + break; + } + } + + // Advance cons with wrap. + cons = (cons + 1) & index_mask; + } + + // Update the stored CMDQ_CONS (preserve error field, update rd). + self.cmdq_cons.set_rd(cons); + } + + /// Handle a CMD_SYNC command. 
+ /// + /// With IDR0.MSI=0, Linux uses CS=SIG_SEV and polls CMDQ_CONS for + /// completion. The MSIWrite path is kept for spec compliance but won't + /// be exercised by Linux when MSI is not advertised. + fn handle_cmd_sync(&mut self, entry: &CmdEntry) { + let cmd = CmdSync::from(entry.qw0); + let cs = SyncCs(cmd.cs()); + + match cs { + SyncCs::SIG_NONE | SyncCs::SIG_SEV => { + // No signal or SEV — nothing to do. Linux polls CMDQ_CONS. + } + SyncCs::SIG_IRQ => { + // Write MSI data to MSI address in guest memory (RAM polling). + let msi_addr = CmdSync::msi_write_addr_from_entry(entry); + let msi_data = cmd.msi_data(); + + if msi_addr != 0 { + if let Err(e) = self + .guest_memory + .write_at(msi_addr, &msi_data.to_le_bytes()) + { + tracelimit::warn_ratelimited!( + error = &e as &dyn std::error::Error, + msi_addr, + "smmu: failed to write CMD_SYNC MSI to guest memory" + ); + } + } + } + _ => { + tracelimit::warn_ratelimited!(cs = cs.0, "smmu: unknown CMD_SYNC CS value"); + } + } + } + + /// Set a command queue error, toggling GERROR.CMDQ_ERR and storing the + /// error code in CMDQ_CONS. + fn set_cmdq_error(&mut self, error: registers::CmdqError) { + // Set error code in CMDQ_CONS. + self.cmdq_cons.set_err(error.0); + // Toggle GERROR.CMDQ_ERR and update interrupt line (atomic). + self.shared_state.toggle_cmdq_err(); + } + + // ========================================================================= + // Event Queue + // ========================================================================= +} + +impl ChipsetDevice for SmmuDevice { + fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> { + Some(self) + } +} + +impl ChangeDeviceState for SmmuDevice { + fn start(&mut self) {} + + async fn stop(&mut self) {} + + async fn reset(&mut self) { + let SmmuDevice { + // Static configuration — not reset. + mmio_region: _, + mmio_base: _, + guest_memory: _, + shared_state, + + // Identification registers — read-only, not reset. 
+ idr0: _, + idr1: _, + idr2: _, + idr3: _, + idr4: _, + idr5: _, + iidr: _, + aidr: _, + + // Control registers — reset to power-on defaults. + cr0, + cr0ack, + cr1, + cr2, + gbpa, + + // Interrupt control. + irq_ctrl, + irq_ctrlack, + + // Stream table base. + strtab_base, + strtab_base_cfg, + + // Command queue. + cmdq_base, + cmdq_prod, + cmdq_cons, + + // Event queue base register. + evtq_base, + + // MSI configuration. + gerror_msi, + evtq_msi, + cmdq_msi, + } = self; + + *cr0 = registers::Cr0::new(); + *cr0ack = registers::Cr0::new(); + *cr1 = registers::Cr1::new(); + *cr2 = registers::Cr2::new(); + *gbpa = registers::Gbpa::new().with_abort(true); + + *irq_ctrl = registers::IrqCtrl::new(); + *irq_ctrlack = registers::IrqCtrl::new(); + + *strtab_base = 0; + *strtab_base_cfg = registers::StrtabBaseCfg::new(); + + *cmdq_base = 0; + *cmdq_prod = 0; + *cmdq_cons = registers::CmdqCons::new(); + + *evtq_base = 0; + + *gerror_msi = MsiConfig::default(); + *evtq_msi = MsiConfig::default(); + *cmdq_msi = MsiConfig::default(); + + // Sync disabled state to shared state so per-device wrappers + // bypass translation immediately. + shared_state.set_enabled(false); + shared_state.set_strtab(0, 0); + // Reset EVTQ state (prod, cons, config, enabled). + // Reset GERROR state and deassert interrupt. + shared_state.reset_queue_state(); + } +} + +impl SaveRestore for SmmuDevice { + type SavedState = SavedStateNotSupported; + + fn save(&mut self) -> Result { + Err(SaveError::NotSupported) + } + + fn restore(&mut self, state: Self::SavedState) -> Result<(), RestoreError> { + match state {} + } +} + +impl MmioIntercept for SmmuDevice { + fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult { + let offset = (addr - self.mmio_base) as u32; + + if offset >= 0x10000 { + // Page 1 register access. 
+ match data.len() { + 4 => { + let value = self.read_page1_reg32(offset); + data.copy_from_slice(&value.to_le_bytes()); + } + 8 => { + let value = self.read_page1_reg64(offset); + data.copy_from_slice(&value.to_le_bytes()); + } + _ => return IoResult::Err(IoError::InvalidAccessSize), + } + } else { + // Page 0 register access. + match data.len() { + 4 => { + let value = self.read_reg32(offset); + data.copy_from_slice(&value.to_le_bytes()); + } + 8 => { + let value = self.read_reg64(offset); + data.copy_from_slice(&value.to_le_bytes()); + } + _ => return IoResult::Err(IoError::InvalidAccessSize), + } + } + + IoResult::Ok + } + + fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult { + let offset = (addr - self.mmio_base) as u32; + + if offset >= 0x10000 { + // Page 1 register access. + match data.len() { + 4 => { + let value = u32::from_le_bytes(data.try_into().unwrap()); + self.write_page1_reg32(offset, value); + } + 8 => { + let value = u64::from_le_bytes(data.try_into().unwrap()); + self.write_page1_reg64(offset, value); + } + _ => return IoResult::Err(IoError::InvalidAccessSize), + } + } else { + // Page 0 register access. + match data.len() { + 4 => { + let value = u32::from_le_bytes(data.try_into().unwrap()); + self.write_reg32(offset, value); + } + 8 => { + let value = u64::from_le_bytes(data.try_into().unwrap()); + self.write_reg64(offset, value); + } + _ => return IoResult::Err(IoError::InvalidAccessSize), + } + } + + IoResult::Ok + } + + fn get_static_regions(&mut self) -> &[(&str, RangeInclusive)] { + std::slice::from_ref(&self.mmio_region) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::spec::events::EvtEntry; + use crate::spec::registers::*; + + const TEST_MMIO_BASE: u64 = 0x0900_0000; + + fn make_test_device() -> SmmuDevice { + let gm = GuestMemory::empty(); + SmmuDevice::new(TEST_MMIO_BASE, gm, &SmmuConfig::default(), None, None) + } + + /// Helper to read a 32-bit register. 
+ fn read32(dev: &mut SmmuDevice, reg_offset: u16) -> u32 { + let mut data = [0u8; 4]; + let result = dev.mmio_read(TEST_MMIO_BASE + reg_offset as u64, &mut data); + assert!(matches!(result, IoResult::Ok)); + u32::from_le_bytes(data) + } + + /// Helper to write a 32-bit register. + fn write32(dev: &mut SmmuDevice, reg_offset: u16, value: u32) { + let data = value.to_le_bytes(); + let result = dev.mmio_write(TEST_MMIO_BASE + reg_offset as u64, &data); + assert!(matches!(result, IoResult::Ok)); + } + + /// Helper to read a 64-bit register. + fn read64(dev: &mut SmmuDevice, reg_offset: u16) -> u64 { + let mut data = [0u8; 8]; + let result = dev.mmio_read(TEST_MMIO_BASE + reg_offset as u64, &mut data); + assert!(matches!(result, IoResult::Ok)); + u64::from_le_bytes(data) + } + + /// Helper to write a 64-bit register. + fn write64(dev: &mut SmmuDevice, reg_offset: u16, value: u64) { + let data = value.to_le_bytes(); + let result = dev.mmio_write(TEST_MMIO_BASE + reg_offset as u64, &data); + assert!(matches!(result, IoResult::Ok)); + } + + /// Helper to read a 32-bit page 1 register (offset >= 0x10000). + fn read32_page1(dev: &mut SmmuDevice, abs_offset: u32) -> u32 { + let mut data = [0u8; 4]; + let result = dev.mmio_read(TEST_MMIO_BASE + abs_offset as u64, &mut data); + assert!(matches!(result, IoResult::Ok)); + u32::from_le_bytes(data) + } + + /// Helper to write a 32-bit page 1 register. + fn write32_page1(dev: &mut SmmuDevice, abs_offset: u32, value: u32) { + let data = value.to_le_bytes(); + let result = dev.mmio_write(TEST_MMIO_BASE + abs_offset as u64, &data); + assert!(matches!(result, IoResult::Ok)); + } + + /// Helper to read a 64-bit page 1 register. + fn read64_page1(dev: &mut SmmuDevice, abs_offset: u32) -> u64 { + let mut data = [0u8; 8]; + let result = dev.mmio_read(TEST_MMIO_BASE + abs_offset as u64, &mut data); + assert!(matches!(result, IoResult::Ok)); + u64::from_le_bytes(data) + } + + /// Helper to write a 64-bit page 1 register. 
+ fn write64_page1(dev: &mut SmmuDevice, abs_offset: u32, value: u64) { + let data = value.to_le_bytes(); + let result = dev.mmio_write(TEST_MMIO_BASE + abs_offset as u64, &data); + assert!(matches!(result, IoResult::Ok)); + } + + #[test] + fn test_idr_readback() { + let mut dev = make_test_device(); + + // IDR0: S1P=1, TTF=0b10, COHACC=1, ASID16=1, MSI=0, TTENDIAN=0b10, + // ST_LVL=0b00 + let idr0 = Idr0::from(read32(&mut dev, IDR0)); + assert!(idr0.s1p()); + assert!(!idr0.s2p()); + assert_eq!(idr0.ttf(), 0b10); + assert!(idr0.cohacc()); + assert!(idr0.asid16()); + assert!(!idr0.msi()); + assert_eq!(idr0.ttendian(), 0b10); + assert_eq!(idr0.st_level(), 0b00); + + // IDR1: SIDSIZE=16, CMDQS=8, EVTQS=8, ATTR_TYPES_OVR=1 + let idr1 = Idr1::from(read32(&mut dev, IDR1)); + assert_eq!(idr1.sidsize(), 16); + assert_eq!(idr1.cmdqs(), 8); + assert_eq!(idr1.eventqs(), 8); + assert!(idr1.attr_types_ovr()); + assert!(!idr1.tables_preset()); + assert!(!idr1.queues_preset()); + assert!(!idr1.rel()); + + // IDR2, IDR3, IDR4 = 0 + assert_eq!(read32(&mut dev, IDR2), 0); + assert_eq!(read32(&mut dev, IDR3), 0); + assert_eq!(read32(&mut dev, IDR4), 0); + + // IDR5: GRAN4K=1, OAS=0b010 (40-bit) + let idr5 = Idr5::from(read32(&mut dev, IDR5)); + assert!(idr5.gran4k()); + assert!(!idr5.gran16k()); + assert!(!idr5.gran64k()); + assert_eq!(idr5.oas(), 0b010); + + // IIDR = 0 + assert_eq!(read32(&mut dev, IIDR), 0); + + // AIDR = 0x03 (SMMUv3.3) + assert_eq!(read32(&mut dev, AIDR), 0x03); + } + + #[test] + fn test_cr0_ack_echo() { + let mut dev = make_test_device(); + + // Write CR0 with all enable bits. + let cr0_val = Cr0::new() + .with_smmuen(true) + .with_cmdqen(true) + .with_eventqen(true); + write32(&mut dev, CR0, cr0_val.into()); + + // CR0ACK should match. + let ack = read32(&mut dev, CR0ACK); + assert_eq!(ack, u32::from(cr0_val)); + } + + #[test] + fn test_cr0_enable_sequence() { + let mut dev = make_test_device(); + + // Step 1: Enable CMDQ.
+ let cr0_cmdq = Cr0::new().with_cmdqen(true); + write32(&mut dev, CR0, cr0_cmdq.into()); + let ack = Cr0::from(read32(&mut dev, CR0ACK)); + assert!(ack.cmdqen()); + assert!(!ack.eventqen()); + assert!(!ack.smmuen()); + + // Step 2: Enable EVTQ. + let cr0_evtq = cr0_cmdq.with_eventqen(true); + write32(&mut dev, CR0, cr0_evtq.into()); + let ack = Cr0::from(read32(&mut dev, CR0ACK)); + assert!(ack.cmdqen()); + assert!(ack.eventqen()); + assert!(!ack.smmuen()); + + // Step 3: Enable SMMU. + let cr0_full = cr0_evtq.with_smmuen(true); + write32(&mut dev, CR0, cr0_full.into()); + let ack = Cr0::from(read32(&mut dev, CR0ACK)); + assert!(ack.cmdqen()); + assert!(ack.eventqen()); + assert!(ack.smmuen()); + } + + #[test] + fn test_strtab_base_readback() { + let mut dev = make_test_device(); + + // Write a 64-bit STRTAB_BASE with address and RA hint. + let base = StrtabBase::new() + .with_addr_bits(0x1234_5678_9AB0u64 >> 6) + .with_ra(true); + write64(&mut dev, STRTAB_BASE, base.into()); + + let readback = StrtabBase::from(read64(&mut dev, STRTAB_BASE)); + assert_eq!(readback.addr(), base.addr()); + assert!(readback.ra()); + + // Write STRTAB_BASE_CFG. + let cfg = StrtabBaseCfg::new().with_log2size(10).with_fmt(0); + write32(&mut dev, STRTAB_BASE_CFG, cfg.into()); + let readback_cfg = StrtabBaseCfg::from(read32(&mut dev, STRTAB_BASE_CFG)); + assert_eq!(readback_cfg.log2size(), 10); + assert_eq!(readback_cfg.fmt(), 0); + } + + #[test] + fn test_irq_ctrl_ack() { + let mut dev = make_test_device(); + + let ctrl = IrqCtrl::new() + .with_eventq_irqen(true) + .with_gerror_irqen(true); + write32(&mut dev, IRQ_CTRL, ctrl.into()); + + let ack = IrqCtrl::from(read32(&mut dev, IRQ_CTRLACK)); + assert!(ack.eventq_irqen()); + assert!(ack.gerror_irqen()); + } + + #[test] + fn test_gbpa_update_bit() { + let mut dev = make_test_device(); + + // Write GBPA with UPDATE=1 and ABORT=0. 
+ let gbpa = Gbpa::new().with_update(true).with_abort(false); + write32(&mut dev, GBPA, gbpa.into()); + + // Read back: UPDATE should be cleared, ABORT should be 0. + let readback = Gbpa::from(read32(&mut dev, GBPA)); + assert!(!readback.update()); + assert!(!readback.abort()); + } + + #[test] + fn test_page1_register_access() { + let mut dev = make_test_device(); + + // EVTQ_CONS on page 1 is guest-writable. + write32_page1(&mut dev, EVENTQ_CONS_PAGE1, 42); + assert_eq!(read32_page1(&mut dev, EVENTQ_CONS_PAGE1), 42); + + // EVTQ_PROD on page 1 is SMMU-writable only (guest writes ignored). + write32_page1(&mut dev, EVENTQ_PROD_PAGE1, 99); + assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), 0); + } + + #[test] + fn test_readonly_regs_ignore_writes() { + let mut dev = make_test_device(); + + let original_idr0 = read32(&mut dev, IDR0); + write32(&mut dev, IDR0, 0xDEAD_BEEF); + assert_eq!(read32(&mut dev, IDR0), original_idr0); + + let original_aidr = read32(&mut dev, AIDR); + write32(&mut dev, AIDR, 0xCAFE); + assert_eq!(read32(&mut dev, AIDR), original_aidr); + + // CR0ACK is read-only. + write32(&mut dev, CR0ACK, 0xFFFF_FFFF); + assert_eq!(read32(&mut dev, CR0ACK), 0); + + // IRQ_CTRLACK is read-only. 
+ write32(&mut dev, IRQ_CTRLACK, 0xFFFF_FFFF); + assert_eq!(read32(&mut dev, IRQ_CTRLACK), 0); + } + + #[test] + fn test_cmdq_base_readback() { + let mut dev = make_test_device(); + + let base = QueueBase::new() + .with_log2size(8) + .with_addr_bits(0x8000_0000u64 >> 5); + write64(&mut dev, CMDQ_BASE, base.into()); + let readback = QueueBase::from(read64(&mut dev, CMDQ_BASE)); + assert_eq!(readback.log2size(), 8); + assert_eq!(readback.addr(), base.addr()); + } + + #[test] + fn test_evtq_base_readback() { + let mut dev = make_test_device(); + + let base = QueueBase::new() + .with_log2size(8) + .with_addr_bits(0xA000_0000u64 >> 5); + write64(&mut dev, EVENTQ_BASE, base.into()); + let readback = QueueBase::from(read64(&mut dev, EVENTQ_BASE)); + assert_eq!(readback.log2size(), 8); + assert_eq!(readback.addr(), base.addr()); + } + + #[test] + fn test_gerror_gerrorn_toggle() { + let mut dev = make_test_device(); + + // Initially GERROR = GERRORN = 0 (no active errors). + assert_eq!(read32(&mut dev, GERROR), 0); + assert_eq!(read32(&mut dev, GERRORN), 0); + + // Toggle CMDQ_ERR via shared state (as the emulator would). + dev.shared_state.toggle_cmdq_err(); + let gerror = Gerror::from(read32(&mut dev, GERROR)); + assert!(gerror.cmdq_err()); + + // Guest acknowledges by writing GERRORN to match GERROR. + write32(&mut dev, GERRORN, gerror.into()); + let gerrorn = Gerror::from(read32(&mut dev, GERRORN)); + assert!(gerrorn.cmdq_err()); + } + + #[test] + fn test_msi_config_registers() { + let mut dev = make_test_device(); + + // GERROR MSI config (page 0). + write64(&mut dev, GERROR_IRQ_CFG0, 0xFEDC_BA98_7654_3210); + assert_eq!(read64(&mut dev, GERROR_IRQ_CFG0), 0xFEDC_BA98_7654_3210); + write32(&mut dev, GERROR_IRQ_CFG1, 0xAABB_CCDD); + assert_eq!(read32(&mut dev, GERROR_IRQ_CFG1), 0xAABB_CCDD); + write32(&mut dev, GERROR_IRQ_CFG2, 0x0000_000F); + assert_eq!(read32(&mut dev, GERROR_IRQ_CFG2), 0x0000_000F); + + // EVENTQ MSI config (page 0). 
+ write64(&mut dev, EVENTQ_IRQ_CFG0, 0x1111_2222_3333_4444); + assert_eq!(read64(&mut dev, EVENTQ_IRQ_CFG0), 0x1111_2222_3333_4444); + write32(&mut dev, EVENTQ_IRQ_CFG1, 0x5555_6666); + assert_eq!(read32(&mut dev, EVENTQ_IRQ_CFG1), 0x5555_6666); + write32(&mut dev, EVENTQ_IRQ_CFG2, 0x0000_0003); + assert_eq!(read32(&mut dev, EVENTQ_IRQ_CFG2), 0x0000_0003); + + // CMDQ MSI config (page 1). + write64_page1(&mut dev, CMDQ_IRQ_CFG0_PAGE1, 0xAAAA_BBBB_CCCC_DDDD); + assert_eq!( + read64_page1(&mut dev, CMDQ_IRQ_CFG0_PAGE1), + 0xAAAA_BBBB_CCCC_DDDD + ); + write32_page1(&mut dev, CMDQ_IRQ_CFG1_PAGE1, 0x1234_5678); + assert_eq!(read32_page1(&mut dev, CMDQ_IRQ_CFG1_PAGE1), 0x1234_5678); + write32_page1(&mut dev, CMDQ_IRQ_CFG2_PAGE1, 0x0000_0007); + assert_eq!(read32_page1(&mut dev, CMDQ_IRQ_CFG2_PAGE1), 0x0000_0007); + } + + #[test] + fn test_invalid_access_size() { + let mut dev = make_test_device(); + + // 1-byte read should fail. + let mut data = [0u8; 1]; + let result = dev.mmio_read(TEST_MMIO_BASE, &mut data); + assert!(matches!(result, IoResult::Err(IoError::InvalidAccessSize))); + + // 1-byte write should fail. + let result = dev.mmio_write(TEST_MMIO_BASE, &[0u8]); + assert!(matches!(result, IoResult::Err(IoError::InvalidAccessSize))); + + // 3-byte read should fail. 
+ let mut data = [0u8; 3]; + let result = dev.mmio_read(TEST_MMIO_BASE, &mut data); + assert!(matches!(result, IoResult::Err(IoError::InvalidAccessSize))); + } + + #[test] + fn test_cr1_cr2_readback() { + let mut dev = make_test_device(); + + let cr1 = Cr1::new() + .with_queue_ic(0b01) + .with_queue_oc(0b01) + .with_queue_sh(0b11) + .with_table_ic(0b01) + .with_table_oc(0b01) + .with_table_sh(0b11); + write32(&mut dev, CR1, cr1.into()); + let readback = Cr1::from(read32(&mut dev, CR1)); + assert_eq!(readback.queue_ic(), 0b01); + assert_eq!(readback.table_sh(), 0b11); + + let cr2 = Cr2::new().with_recinvsid(true); + write32(&mut dev, CR2, cr2.into()); + let readback = Cr2::from(read32(&mut dev, CR2)); + assert!(readback.recinvsid()); + } + + #[test] + fn test_cmdq_prod_readback() { + let mut dev = make_test_device(); + + write32(&mut dev, CMDQ_PROD, 0x0000_0005); + assert_eq!(read32(&mut dev, CMDQ_PROD), 0x0000_0005); + } + + // ========================================================================= + // CMDQ processing tests + // ========================================================================= + + /// Size of the test CMDQ: 2^3 = 8 entries. + const TEST_CMDQ_LOG2SIZE: u8 = 3; + /// GPA where the test CMDQ lives. + const TEST_CMDQ_GPA: u64 = 0x1_0000; + /// GPA where CMD_SYNC MSI writes go. + const TEST_MSI_GPA: u64 = 0x2_0000; + + /// Create a device with real guest memory and a configured CMDQ. + fn make_cmdq_test_device() -> SmmuDevice { + // Allocate enough guest memory for CMDQ + MSI target page. + let gm = GuestMemory::allocate(0x4_0000); + let mut dev = SmmuDevice::new(TEST_MMIO_BASE, gm, &SmmuConfig::default(), None, None); + + // Program CMDQ_BASE: address + log2size. + let cmdq_base = QueueBase::new() + .with_log2size(TEST_CMDQ_LOG2SIZE) + .with_addr_bits(TEST_CMDQ_GPA >> 5); + write64(&mut dev, CMDQ_BASE, cmdq_base.into()); + + // Enable CMDQEN. 
+ let cr0 = Cr0::new().with_cmdqen(true); + write32(&mut dev, CR0, cr0.into()); + + dev + } + + /// Write a command entry to the CMDQ at the given index. + fn write_cmdq_entry(dev: &SmmuDevice, index: u32, entry: &CmdEntry) { + let addr = TEST_CMDQ_GPA + (index as u64) * (size_of::() as u64); + dev.guest_memory + .write_plain(addr, entry) + .expect("write cmd entry"); + } + + #[test] + fn test_cmdq_basic_consumption() { + let mut dev = make_cmdq_test_device(); + + // Write 3 commands: CFGI_STE_RANGE (CFGI_ALL), TLBI_NSNH_ALL, CMD_SYNC(SEV). + write_cmdq_entry( + &dev, + 0, + &CmdEntry { + qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64, + qw1: 31, // Range=31 = ALL + }, + ); + write_cmdq_entry( + &dev, + 1, + &CmdEntry { + qw0: CmdOpcode::TLBI_NSNH_ALL.0 as u64, + qw1: 0, + }, + ); + let sync = CmdSync::new() + .with_opcode(CmdOpcode::CMD_SYNC.0) + .with_cs(SyncCs::SIG_SEV.0); + write_cmdq_entry( + &dev, + 2, + &CmdEntry { + qw0: sync.into(), + qw1: 0, + }, + ); + + // Set PROD=3, triggering processing. + write32(&mut dev, CMDQ_PROD, 3); + + // Verify CONS=3. + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), 3); + assert_eq!(cons.err(), 0); + } + + #[test] + fn test_cmdq_sync_msi_write() { + let mut dev = make_cmdq_test_device(); + + let msi_data: u32 = 0xDEAD_BEEF; + let msi_addr: u64 = TEST_MSI_GPA; + + // Build CMD_SYNC with CS=SIG_IRQ and MSI address/data. + let sync = CmdSync::new() + .with_opcode(CmdOpcode::CMD_SYNC.0) + .with_cs(SyncCs::SIG_IRQ.0) + .with_msi_data(msi_data); + // MSI address goes in qw1 bits [119:66] → addr[55:2] at bits [53:0] + // shifted left by 2 in qw1. + let qw1 = (msi_addr >> 2) << 2; + write_cmdq_entry( + &dev, + 0, + &CmdEntry { + qw0: sync.into(), + qw1, + }, + ); + + // Set PROD=1. + write32(&mut dev, CMDQ_PROD, 1); + + // Verify CONS=1. + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), 1); + + // Verify MSI data written to the target GPA. 
+ let written: u32 = dev + .guest_memory + .read_plain(msi_addr) + .expect("read MSI data"); + assert_eq!(written, msi_data); + } + + #[test] + fn test_cmdq_wrap() { + let mut dev = make_cmdq_test_device(); + + let max_entries = 1u32 << TEST_CMDQ_LOG2SIZE; // 8 + + // Fill the queue completely: 8 CFGI_STE_RANGE commands. + for i in 0..max_entries { + write_cmdq_entry( + &dev, + i, + &CmdEntry { + qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64, + qw1: 31, + }, + ); + } + + // Set PROD = 8 (which with wrap bit means index 0 with wrap=1). + write32(&mut dev, CMDQ_PROD, max_entries); + + // CONS should advance to 8 (matching PROD with wrap). + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), max_entries); + assert_eq!(cons.err(), 0); + + // Now write one more command at index 0 (wrapping around). + write_cmdq_entry( + &dev, + 0, + &CmdEntry { + qw0: CmdOpcode::TLBI_NH_ALL.0 as u64, + qw1: 0, + }, + ); + + // PROD = 9 (wrap bit set, index 1). + write32(&mut dev, CMDQ_PROD, max_entries + 1); + + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), max_entries + 1); + } + + #[test] + fn test_cmdq_unknown_opcode() { + let mut dev = make_cmdq_test_device(); + + // Write a command with unknown opcode 0xFF. + write_cmdq_entry(&dev, 0, &CmdEntry { qw0: 0xFF, qw1: 0 }); + + write32(&mut dev, CMDQ_PROD, 1); + + // CONS should have CERROR_ILL in the error field. + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.err(), CmdqError::CERROR_ILL.0); + + // GERROR.CMDQ_ERR should be toggled (was 0, now 1). + let gerror = Gerror::from(read32(&mut dev, GERROR)); + assert!(gerror.cmdq_err()); + } + + #[test] + fn test_cmdq_log2size_clamped_to_idr1() { + let gm = GuestMemory::allocate(0x4_0000); + let mut dev = SmmuDevice::new(TEST_MMIO_BASE, gm, &SmmuConfig::default(), None, None); + + // IDR1.CMDQS = 8, IDR1.EVENTQS = 8. Program a larger value (20). 
+ let cmdq_base = QueueBase::new() + .with_log2size(20) + .with_addr_bits(TEST_CMDQ_GPA >> 5); + write64(&mut dev, CMDQ_BASE, cmdq_base.into()); + + // The effective log2size should be clamped to 8. + assert_eq!(dev.cmdq_log2size(), 8); + + // A value within the limit should pass through unchanged. + let cmdq_base = QueueBase::new() + .with_log2size(5) + .with_addr_bits(TEST_CMDQ_GPA >> 5); + write64(&mut dev, CMDQ_BASE, cmdq_base.into()); + assert_eq!(dev.cmdq_log2size(), 5); + } + + #[test] + fn test_cmdq_linux_reset_sequence() { + let mut dev = make_cmdq_test_device(); + + // Linux reset sequence: CFGI_ALL + CMD_SYNC, TLBI_NSNH_ALL + CMD_SYNC. + // Step 1: CFGI_ALL (CFGI_STE_RANGE with Range=31) + CMD_SYNC(SEV). + write_cmdq_entry( + &dev, + 0, + &CmdEntry { + qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64, + qw1: 31, + }, + ); + let sync = CmdSync::new() + .with_opcode(CmdOpcode::CMD_SYNC.0) + .with_cs(SyncCs::SIG_SEV.0); + write_cmdq_entry( + &dev, + 1, + &CmdEntry { + qw0: sync.into(), + qw1: 0, + }, + ); + write32(&mut dev, CMDQ_PROD, 2); + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), 2); + assert_eq!(cons.err(), 0); + + // Step 2: TLBI_NSNH_ALL + CMD_SYNC(SEV). + write_cmdq_entry( + &dev, + 2, + &CmdEntry { + qw0: CmdOpcode::TLBI_NSNH_ALL.0 as u64, + qw1: 0, + }, + ); + write_cmdq_entry( + &dev, + 3, + &CmdEntry { + qw0: sync.into(), + qw1: 0, + }, + ); + write32(&mut dev, CMDQ_PROD, 4); + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), 4); + assert_eq!(cons.err(), 0); + + // No errors should be set. + let gerror = Gerror::from(read32(&mut dev, GERROR)); + assert!(!gerror.cmdq_err()); + } + + #[test] + fn test_cmdq_error_stops_processing() { + let mut dev = make_cmdq_test_device(); + + // Write: unknown opcode, then a valid command. 
+ write_cmdq_entry( + &dev, + 0, + &CmdEntry { + qw0: 0xFF, // Unknown + qw1: 0, + }, + ); + write_cmdq_entry( + &dev, + 1, + &CmdEntry { + qw0: CmdOpcode::TLBI_NH_ALL.0 as u64, + qw1: 0, + }, + ); + + write32(&mut dev, CMDQ_PROD, 2); + + // CONS should be at 0 — processing stopped at the unknown command. + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), 0); + assert_eq!(cons.err(), CmdqError::CERROR_ILL.0); + + // Even if we write more PROD, processing should not resume (error active). + write32(&mut dev, CMDQ_PROD, 2); + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), 0); + + // Acknowledge the error by writing GERRORN to match GERROR. + let gerror = read32(&mut dev, GERROR); + write32(&mut dev, GERRORN, gerror); + + // Clear the error in CMDQ_CONS by resetting it internally. + // In practice, the guest would reprogram CMDQ_BASE and re-enable, + // but for this test we just verify the error flag blocks processing. + } + + #[test] + fn test_cmdq_disabled() { + // Create device but do NOT enable CMDQEN. + let gm = GuestMemory::allocate(0x4_0000); + let mut dev = SmmuDevice::new(TEST_MMIO_BASE, gm, &SmmuConfig::default(), None, None); + + let cmdq_base = QueueBase::new() + .with_log2size(TEST_CMDQ_LOG2SIZE) + .with_addr_bits(TEST_CMDQ_GPA >> 5); + write64(&mut dev, CMDQ_BASE, cmdq_base.into()); + + // Write a command and set PROD without enabling CMDQEN. + write_cmdq_entry( + &dev, + 0, + &CmdEntry { + qw0: CmdOpcode::TLBI_NH_ALL.0 as u64, + qw1: 0, + }, + ); + write32(&mut dev, CMDQ_PROD, 1); + + // CONS should stay at 0 — CMDQ is disabled. + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), 0); + } + + // ========================================================================= + // EVTQ tests + // ========================================================================= + + /// Size of the test EVTQ: 2^3 = 8 entries. 
+    const TEST_EVTQ_LOG2SIZE: u8 = 3;
+    /// GPA where the test EVTQ lives.
+    const TEST_EVTQ_GPA: u64 = 0x3_0000;
+    /// GPA where the EVTQ MSI writes go.
+    const TEST_EVTQ_MSI_GPA: u64 = 0x2_0100;
+
+    /// Create a device with EVTQ configured and enabled.
+    ///
+    /// Programs EVENTQ_BASE with `TEST_EVTQ_GPA` / `TEST_EVTQ_LOG2SIZE`, writes
+    /// EVENTQ_IRQ_CFG0/CFG1, then sets CR0.EVENTQEN and IRQ_CTRL.EVENTQ_IRQEN.
+    /// The command queue is left disabled.
+    fn make_evtq_test_device() -> SmmuDevice {
+        let gm = GuestMemory::allocate(0x4_0000);
+        let mut dev = SmmuDevice::new(TEST_MMIO_BASE, gm, &SmmuConfig::default(), None, None);
+
+        // Program EVTQ_BASE.
+        let evtq_base = QueueBase::new()
+            .with_log2size(TEST_EVTQ_LOG2SIZE)
+            .with_addr_bits(TEST_EVTQ_GPA >> 5);
+        write64(&mut dev, EVENTQ_BASE, evtq_base.into());
+
+        // Program EVTQ MSI config.
+        write64(&mut dev, EVENTQ_IRQ_CFG0, TEST_EVTQ_MSI_GPA);
+        write32(&mut dev, EVENTQ_IRQ_CFG1, 0xBEEF);
+
+        // Enable EVTQEN + EVENTQ_IRQEN.
+        let cr0 = Cr0::new().with_eventqen(true);
+        write32(&mut dev, CR0, cr0.into());
+        let irq_ctrl = IrqCtrl::new().with_eventq_irqen(true);
+        write32(&mut dev, IRQ_CTRL, irq_ctrl.into());
+
+        dev
+    }
+
+    /// Writes one translation-fault event through the shared state and checks
+    /// that EVENTQ_PROD advances and the record is readable at the queue base.
+    #[test]
+    fn test_evtq_write_and_read() {
+        let mut dev = make_evtq_test_device();
+
+        let event = EvtEntry::translation_fault(42, 0x1000_0000, false);
+        dev.shared_state().write_event(event);
+
+        // EVTQ_PROD should advance to 1.
+        assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), 1);
+
+        // Read the event record from guest memory.
+        let written: EvtEntry = dev
+            .guest_memory
+            .read_plain(TEST_EVTQ_GPA)
+            .expect("read event");
+        assert_eq!(
+            written.event_id(),
+            crate::spec::events::EventId::F_TRANSLATION
+        );
+        assert_eq!(written.sid, 42);
+        assert_eq!(written.input_addr, 0x1000_0000);
+        assert!(written.flags.rnw()); // read (rnw=true because write=false)
+    }
+
+    /// Writes two events and checks that EVENTQ_PROD advances by one each
+    /// time and that the records occupy consecutive queue slots.
+    #[test]
+    fn test_evtq_write_advances_prod() {
+        let mut dev = make_evtq_test_device();
+
+        // Write two events and verify PROD advances each time.
+        let event1 = EvtEntry::translation_fault(1, 0x2000, true);
+        dev.shared_state().write_event(event1);
+        assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), 1);
+
+        let event2 = EvtEntry::translation_fault(2, 0x3000, false);
+        dev.shared_state().write_event(event2);
+        assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), 2);
+
+        // Verify both events are in guest memory.
+        let e1: EvtEntry = dev.guest_memory.read_plain(TEST_EVTQ_GPA).expect("read");
+        assert_eq!(e1.sid, 1);
+        // The second record sits one `EvtEntry::SIZE` past the queue base.
+        let e2: EvtEntry = dev
+            .guest_memory
+            .read_plain(TEST_EVTQ_GPA + EvtEntry::SIZE as u64)
+            .expect("read");
+        assert_eq!(e2.sid, 2);
+    }
+
+    /// Fills the event queue and checks that a further event leaves
+    /// EVENTQ_PROD unchanged.
+    #[test]
+    fn test_evtq_full() {
+        let mut dev = make_evtq_test_device();
+
+        let max_entries = 1u32 << TEST_EVTQ_LOG2SIZE; // 8
+        for i in 0..max_entries {
+            let event = EvtEntry::translation_fault(i, 0x1000 * i as u64, false);
+            dev.shared_state().write_event(event);
+        }
+
+        // Queue should be full now. PROD = 8 (wrap), CONS = 0.
+        assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), max_entries);
+
+        // Writing one more should be dropped (queue full).
+        let event = EvtEntry::translation_fault(99, 0xDEAD, false);
+        dev.shared_state().write_event(event);
+
+        // PROD should NOT advance (event dropped).
+        assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), max_entries);
+    }
+
+    /// Fills the event queue, advances EVENTQ_CONS by three, and checks that
+    /// three more events are accepted.
+    #[test]
+    fn test_evtq_cons_frees_space() {
+        let mut dev = make_evtq_test_device();
+
+        let max_entries = 1u32 << TEST_EVTQ_LOG2SIZE; // 8
+        for i in 0..max_entries {
+            let event = EvtEntry::translation_fault(i, 0x1000 * i as u64, false);
+            dev.shared_state().write_event(event);
+        }
+
+        // Queue is full. Advance CONS to consume 3 entries.
+        write32_page1(&mut dev, EVENTQ_CONS_PAGE1, 3);
+
+        // Should be able to write 3 more events.
+        for i in 0..3u32 {
+            let event = EvtEntry::translation_fault(100 + i, 0xF000, false);
+            dev.shared_state().write_event(event);
+        }
+
+        // PROD should now be at 8 + 3 = 11 (wrap bit set, index 3).
+        assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), max_entries + 3);
+    }
+
+    // =========================================================================
+    // Sub-phase 1J: End-to-End Integration Test
+    // =========================================================================
+
+    /// End-to-end test that exercises the full SMMU stack:
+    /// MMIO register programming → command queue → stream table → context
+    /// descriptor → page table walk → translated DMA read/write → MSI
+    /// translation.
+    ///
+    /// Mimics the Linux SMMUv3 driver initialization sequence:
+    /// 1. Probe: read IDR registers, verify feature bits.
+    /// 2. Reset: disable SMMU, program CR1, stream table, queues, enable.
+    /// 3. Attach: configure STE and CD for a device.
+    /// 4. DMA: read/write through SmmuTranslatingMemory.
+    /// 5. MSI: fire MSI through SmmuSignalMsi with translated address.
+    /// 6. Fault: access unmapped IOVA, verify EVTQ event.
+    #[test]
+    fn test_end_to_end_linux_driver_sequence() {
+        use crate::spec::cd::Cd;
+        use crate::spec::cd::CdDw0;
+        use crate::spec::cd::CdDw1;
+        use crate::spec::cd::Ips;
+        use crate::spec::cd::Tg0;
+        use crate::spec::commands::CmdCfgiCd;
+        use crate::spec::commands::CmdCfgiSte;
+        use crate::spec::commands::CmdCfgiSteRange;
+        use crate::spec::commands::CmdOpcode;
+        use crate::spec::commands::CmdSync;
+        use crate::spec::commands::SyncCs;
+        use crate::spec::events::EventId;
+        use crate::spec::pt::ApBits;
+        use crate::spec::pt::PtDesc;
+        use crate::spec::ste::STE_SIZE;
+        use crate::spec::ste::Ste;
+        use crate::spec::ste::SteConfig;
+        use crate::spec::ste::SteDw0;
+        use crate::spec::ste::SteDw1;
+        use parking_lot::Mutex;
+        use pci_core::bus_range::AssignedBusRange;
+        use pci_core::msi::SignalMsi;
+        use std::sync::Arc;
+
+        // =====================================================================
+        // Memory layout constants
+        // =====================================================================
+
+        const STRTAB_GPA: u64 = 0x10_0000; // Stream
table + const STRTAB_LOG2SIZE: u8 = 10; // 1024 entries + const CMDQ_GPA: u64 = 0x20_0000; // Command queue + const CMDQ_LOG2SIZE: u8 = 5; // 32 entries + const EVTQ_GPA: u64 = 0x30_0000; // Event queue + const EVTQ_LOG2SIZE: u8 = 5; // 32 entries + const CD_GPA: u64 = 0x40_0000; // Context descriptor table + const PT_L1_GPA: u64 = 0x50_1000; // L1 page table + const PT_L2_GPA: u64 = 0x50_2000; // L2 page table + const PT_L3_GPA: u64 = 0x50_3000; // L3 page table + const DATA_GPA: u64 = 0x6000_0000; // Translated target page + const DOORBELL_GPA: u64 = 0x7000_0000; // MSI doorbell physical page + const SYNC_MSI_GPA: u64 = 0x80_0000; // CMD_SYNC MSI target + const EVTQ_MSI_GPA: u64 = 0x80_0100; // EVTQ MSI target + + // IOVA space layout (guest-programmed) + const DMA_IOVA: u64 = 0x0000_0000; // Maps to DATA_GPA + const DOORBELL_IOVA: u64 = 0x0800_0000; // Maps to DOORBELL_GPA + + // Device identity + const SEGMENT: u16 = 0; + const BUS: u8 = 1; + const STREAM_ID_BASE: u32 = (SEGMENT as u32) << 16; + const STREAM_ID: u32 = STREAM_ID_BASE + ((BUS as u32) << 8); + + // ===================================================================== + // Mock MSI target + // ===================================================================== + + struct MockSignalMsi { + calls: Mutex, u64, u32)>>, + } + + impl MockSignalMsi { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + + fn take_calls(&self) -> Vec<(Option, u64, u32)> { + std::mem::take(&mut *self.calls.lock()) + } + } + + impl SignalMsi for MockSignalMsi { + fn signal_msi(&self, devid: Option, address: u64, data: u32) { + self.calls.lock().push((devid, address, data)); + } + } + + // Helper to write a command entry to the CMDQ at a given index. 
+ fn write_cmd(gm: &GuestMemory, index: u32, entry: &CmdEntry) { + let addr = CMDQ_GPA + (index as u64) * (size_of::() as u64); + gm.write_plain(addr, entry).expect("write cmd entry"); + } + + // ===================================================================== + // Allocate guest memory and create device + // ===================================================================== + + let gm = GuestMemory::allocate(0x8000_0000); // 2 GiB + let mut dev = SmmuDevice::new( + TEST_MMIO_BASE, + gm.clone(), + &SmmuConfig::default(), + None, + None, + ); + + // ===================================================================== + // Step 1: Probe — read IDR registers (arm_smmu_device_hw_probe) + // ===================================================================== + + let idr0 = Idr0::from(read32(&mut dev, IDR0)); + assert!(idr0.s1p(), "S1 translation must be supported"); + assert_eq!(idr0.ttf(), 0b10, "TTF must include AArch64"); + assert!(!idr0.msi(), "MSI must not be advertised (wired SPIs)"); + assert_eq!(idr0.ttendian(), 0b10, "Must be little-endian"); + assert_eq!(idr0.st_level(), 0b00, "Must be linear stream table"); + + let idr1 = Idr1::from(read32(&mut dev, IDR1)); + assert_eq!(idr1.sidsize(), 16); + assert!(idr1.cmdqs() >= 5, "CMDQS must support our queue size"); + + let idr5 = Idr5::from(read32(&mut dev, IDR5)); + assert!(idr5.gran4k(), "4K granule must be supported"); + + // ===================================================================== + // Step 2: Reset — arm_smmu_device_reset() sequence + // ===================================================================== + + // 2a. Disable SMMU. + write32(&mut dev, CR0, 0); + assert_eq!( + read32(&mut dev, CR0ACK), + 0, + "CR0ACK must reflect disabled state" + ); + + // 2b. Program CR1 (memory attributes for table walks). 
+ let cr1 = Cr1::new() + .with_table_sh(0b11) // Inner shareable + .with_table_oc(0b01) // Write-back + .with_table_ic(0b01) // Write-back + .with_queue_sh(0b11) + .with_queue_oc(0b01) + .with_queue_ic(0b01); + write32(&mut dev, CR1, cr1.into()); + + // 2c. Program stream table base. + let strtab_base = StrtabBase::new().with_addr_bits(STRTAB_GPA >> 6); + write64(&mut dev, STRTAB_BASE, strtab_base.into()); + let strtab_cfg = StrtabBaseCfg::new() + .with_log2size(STRTAB_LOG2SIZE) + .with_fmt(0); // Linear + write32(&mut dev, STRTAB_BASE_CFG, strtab_cfg.into()); + + // Verify readback. + assert_eq!( + StrtabBase::from(read64(&mut dev, STRTAB_BASE)).addr(), + STRTAB_GPA + ); + + // 2d. Program CMDQ. + let cmdq_base = QueueBase::new() + .with_log2size(CMDQ_LOG2SIZE) + .with_addr_bits(CMDQ_GPA >> 5); + write64(&mut dev, CMDQ_BASE, cmdq_base.into()); + write32(&mut dev, CMDQ_PROD, 0); + // CMDQ_CONS is SMMU-writable only; starts at 0. + + // 2e. Enable CMDQEN. + let cr0_cmdqen = Cr0::new().with_cmdqen(true); + write32(&mut dev, CR0, cr0_cmdqen.into()); + assert_eq!( + Cr0::from(read32(&mut dev, CR0ACK)).cmdqen(), + true, + "CMDQEN must be acknowledged" + ); + + // 2f. Issue CFGI_ALL + CMD_SYNC (invalidate all cached STEs). + let mut cmd_idx: u32 = 0; + + let cfgi_all = CmdEntry { + qw0: CmdCfgiSteRange::new() + .with_opcode(CmdOpcode::CFGI_STE_RANGE.0) + .into(), + qw1: CmdCfgiSteRange::RANGE_ALL as u64, + }; + write_cmd(&gm, cmd_idx, &cfgi_all); + cmd_idx += 1; + + let sync0 = CmdEntry { + qw0: CmdSync::new() + .with_opcode(CmdOpcode::CMD_SYNC.0) + .with_cs(SyncCs::SIG_IRQ.0) + .with_msi_data(0xAAAA) + .into(), + qw1: (SYNC_MSI_GPA >> 2) << 2, + }; + write_cmd(&gm, cmd_idx, &sync0); + cmd_idx += 1; + + write32(&mut dev, CMDQ_PROD, cmd_idx); + + // Verify CONS advanced. + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), cmd_idx, "CMDQ_CONS must advance to PROD"); + + // Verify CMD_SYNC MSI written. 
+ let sync_val: u32 = gm.read_plain(SYNC_MSI_GPA).expect("read sync MSI"); + assert_eq!(sync_val, 0xAAAA, "CMD_SYNC MSI data must match"); + + // 2g. Issue TLBI_NSNH_ALL + CMD_SYNC. + let tlbi_all = CmdEntry { + qw0: CmdOpcode::TLBI_NSNH_ALL.0 as u64, + qw1: 0, + }; + write_cmd(&gm, cmd_idx, &tlbi_all); + cmd_idx += 1; + + // Reset sync target. + gm.write_at(SYNC_MSI_GPA, &0u32.to_le_bytes()).unwrap(); + + let sync1 = CmdEntry { + qw0: CmdSync::new() + .with_opcode(CmdOpcode::CMD_SYNC.0) + .with_cs(SyncCs::SIG_IRQ.0) + .with_msi_data(0xBBBB) + .into(), + qw1: (SYNC_MSI_GPA >> 2) << 2, + }; + write_cmd(&gm, cmd_idx, &sync1); + cmd_idx += 1; + + write32(&mut dev, CMDQ_PROD, cmd_idx); + + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), cmd_idx); + let sync_val: u32 = gm.read_plain(SYNC_MSI_GPA).expect("read sync MSI"); + assert_eq!(sync_val, 0xBBBB); + + // 2h. Program EVTQ. + let evtq_base = QueueBase::new() + .with_log2size(EVTQ_LOG2SIZE) + .with_addr_bits(EVTQ_GPA >> 5); + write64(&mut dev, EVENTQ_BASE, evtq_base.into()); + + // Program EVTQ MSI config. + write64(&mut dev, EVENTQ_IRQ_CFG0, EVTQ_MSI_GPA); + write32(&mut dev, EVENTQ_IRQ_CFG1, 0xDEAD); + + // 2i. Enable EVTQEN. + let cr0_evtqen = Cr0::new().with_cmdqen(true).with_eventqen(true); + write32(&mut dev, CR0, cr0_evtqen.into()); + assert!(Cr0::from(read32(&mut dev, CR0ACK)).eventqen()); + + // 2j. Enable EVENTQ IRQ. + let irq_ctrl = IrqCtrl::new().with_eventq_irqen(true); + write32(&mut dev, IRQ_CTRL, irq_ctrl.into()); + assert!(IrqCtrl::from(read32(&mut dev, IRQ_CTRLACK)).eventq_irqen()); + + // 2k. Enable SMMUEN. 
+ let cr0_full = Cr0::new() + .with_cmdqen(true) + .with_eventqen(true) + .with_smmuen(true); + write32(&mut dev, CR0, cr0_full.into()); + let cr0ack = Cr0::from(read32(&mut dev, CR0ACK)); + assert!(cr0ack.smmuen(), "SMMUEN must be acknowledged"); + assert!(cr0ack.cmdqen()); + assert!(cr0ack.eventqen()); + + // ===================================================================== + // Step 3: Attach device — configure STE and CD for stream ID + // ===================================================================== + + // 3a. Write STE: S1_TRANS mode, point to CD table at CD_GPA. + let ste = Ste { + qw0: SteDw0::new() + .with_v(true) + .with_config(SteConfig::S1_TRANS.0) + .with_s1_context_ptr(CD_GPA >> 6) + .with_s1_cd_max(0), // Single CD (SSID=0 only) + qw1: SteDw1::new(), + _qw2_7: [0u64; 6], + }; + let ste_addr = STRTAB_GPA + (STREAM_ID as u64) * (STE_SIZE as u64); + gm.write_plain(ste_addr, &ste).expect("write STE"); + + // 3b. Write CD: TTB0 = PT_L1_GPA, T0SZ=32 (32-bit VA), 4K granule, 40-bit OAS. + let cd = Cd { + qw0: CdDw0::new() + .with_v(true) + .with_t0sz(32) + .with_tg0(Tg0::GRAN_4K.0) + .with_ips(Ips::IPS_40.0) + .with_aa64(true) + .with_asid(1), + qw1: CdDw1::new().with_ttb0(PT_L1_GPA >> 4), + _qw2: 0, + mair0: 0xFF440C0400, + mair1: 0, + _qw5_7: [0; 3], + }; + let cd_addr = CD_GPA; // SSID=0 + gm.write_plain(cd_addr, &cd).expect("write CD"); + + // 3c. Build page table hierarchy for DMA region: + // IOVA 0x0000_0000..0x0000_0FFF → DATA_GPA + // T0SZ=32, 4K granule → 3-level walk (L1, L2, L3). 
+ // + // L1[0] → L2 table + let l1_desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_addr_bits(PT_L2_GPA >> 12); + gm.write_plain::(PT_L1_GPA, &l1_desc.into()) + .expect("write L1"); + + // L2[0] → L3 table + let l2_desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_addr_bits(PT_L3_GPA >> 12); + gm.write_plain::(PT_L2_GPA, &l2_desc.into()) + .expect("write L2"); + + // L3[0] → page at DATA_GPA (RW, AF=1) + let l3_desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) // L3: type=1 means page + .with_af(true) + .with_ap(ApBits::RW_EL1.0) + .with_addr_bits(DATA_GPA >> 12); + gm.write_plain::(PT_L3_GPA, &l3_desc.into()) + .expect("write L3[0]"); + + // 3d. Build page table for doorbell region (for MSI translation): + // IOVA 0x0800_0000 → DOORBELL_GPA + // L1 index = 0x0800_0000 >> 30 = 0 (same L1 entry) + // L2 index = (0x0800_0000 >> 21) & 0x1FF = 64 + // L3 index = (0x0800_0000 >> 12) & 0x1FF = 0 + // + // We need a separate L2→L3 chain for L2[64]. + const DOORBELL_PT_L3_GPA: u64 = 0x50_4000; + + // L2[64] → doorbell L3 table + let l2_doorbell_desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_addr_bits(DOORBELL_PT_L3_GPA >> 12); + let l2_doorbell_offset = 64 * 8; // L2 index 64, 8 bytes per entry + gm.write_plain::(PT_L2_GPA + l2_doorbell_offset, &l2_doorbell_desc.into()) + .expect("write L2[64]"); + + // Doorbell L3[0] → page at DOORBELL_GPA + let l3_doorbell_desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_af(true) + .with_ap(ApBits::RW_EL1.0) + .with_addr_bits(DOORBELL_GPA >> 12); + gm.write_plain::(DOORBELL_PT_L3_GPA, &l3_doorbell_desc.into()) + .expect("write doorbell L3[0]"); + + // 3e. Issue CFGI_STE + CFGI_CD + CMD_SYNC via CMDQ. 
+ let cfgi_ste = CmdEntry { + qw0: CmdCfgiSte::new() + .with_opcode(CmdOpcode::CFGI_STE.0) + .with_sid(STREAM_ID) + .into(), + qw1: 0, + }; + write_cmd(&gm, cmd_idx, &cfgi_ste); + cmd_idx += 1; + + let cfgi_cd = CmdEntry { + qw0: CmdCfgiCd::new() + .with_opcode(CmdOpcode::CFGI_CD.0) + .with_sid(STREAM_ID) + .with_ssid(0) + .into(), + qw1: 0, + }; + write_cmd(&gm, cmd_idx, &cfgi_cd); + cmd_idx += 1; + + // Reset sync target. + gm.write_at(SYNC_MSI_GPA, &0u32.to_le_bytes()).unwrap(); + + let sync2 = CmdEntry { + qw0: CmdSync::new() + .with_opcode(CmdOpcode::CMD_SYNC.0) + .with_cs(SyncCs::SIG_IRQ.0) + .with_msi_data(0xCCCC) + .into(), + qw1: (SYNC_MSI_GPA >> 2) << 2, + }; + write_cmd(&gm, cmd_idx, &sync2); + cmd_idx += 1; + + write32(&mut dev, CMDQ_PROD, cmd_idx); + + let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS)); + assert_eq!(cons.rd(), cmd_idx, "All commands must be consumed"); + let sync_val: u32 = gm.read_plain(SYNC_MSI_GPA).expect("read sync MSI"); + assert_eq!(sync_val, 0xCCCC, "CFGI+SYNC completion must be signaled"); + + // ===================================================================== + // Step 4: DMA — read/write through SmmuTranslatingMemory + // ===================================================================== + + // Create per-device wrappers. + let shared_state = dev.shared_state().clone(); + let bus_range = AssignedBusRange::new(); + bus_range.set_bus_range(BUS, BUS); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, smmu_msi) = + shared_state.create_device_context(bus_range, STREAM_ID_BASE, &gm, mock_msi.clone()); + + // 4a. Write test data at DATA_GPA via raw guest memory. + let test_data = b"Hello from SMMU end-to-end test!"; + gm.write_at(DATA_GPA, test_data).unwrap(); + + // 4b. Read via IOVA → should get data from DATA_GPA. 
+ let mut buf = vec![0u8; test_data.len()]; + translating_gm + .read_at(DMA_IOVA, &mut buf) + .expect("DMA read through SMMU must succeed"); + assert_eq!(&buf, test_data, "Translated read must return correct data"); + + // 4c. Write via IOVA with an offset. + let write_data = b"DMA write OK"; + let write_offset = 0x100u64; + translating_gm + .write_at(DMA_IOVA + write_offset, write_data) + .expect("DMA write through SMMU must succeed"); + + // Verify at raw GPA. + let mut verify_buf = vec![0u8; write_data.len()]; + gm.read_at(DATA_GPA + write_offset, &mut verify_buf) + .unwrap(); + assert_eq!( + &verify_buf, write_data, + "Translated write must land at correct GPA" + ); + + // ===================================================================== + // Step 5: MSI — translate MSI address through SMMU + // ===================================================================== + + // Fire MSI with address = DOORBELL_IOVA + 0x40 (intra-page offset). + // The SMMU should translate DOORBELL_IOVA → DOORBELL_GPA. + // devid is a RID: (bus << 8 | devfn). Must be within the device's + // assigned bus range for the SMMU to accept it. + let device_rid = (BUS as u32) << 8; // devfn = 0 + smmu_msi.signal_msi(Some(device_rid), DOORBELL_IOVA + 0x40, 0x1234); + + let msi_calls = mock_msi.take_calls(); + assert_eq!(msi_calls.len(), 1, "Exactly one MSI must be forwarded"); + let (devid, addr, data) = &msi_calls[0]; + assert_eq!(*devid, Some(device_rid), "devid must be passed through"); + assert_eq!( + *addr, + DOORBELL_GPA + 0x40, + "MSI address must be translated with offset" + ); + assert_eq!(*data, 0x1234, "MSI data must be passed through"); + + // ===================================================================== + // Step 6: Fault — access unmapped IOVA, verify EVTQ event + // ===================================================================== + + // IOVA 0x1000_0000 has no page table mapping → translation fault. 
+ let unmapped_iova: u64 = 0x1000_0000; + let mut fault_buf = [0u8; 4]; + let result = translating_gm.read_at(unmapped_iova, &mut fault_buf); + assert!(result.is_err(), "Read from unmapped IOVA must return error"); + + // The fault event is queued in shared state. Trigger a drain by + // writing CMDQ_PROD (which drains pending events). + write32(&mut dev, CMDQ_PROD, cmd_idx); // No new commands, just drain. + + // Verify EVTQ_PROD advanced (an event was written). + let evtq_prod = read32_page1(&mut dev, EVENTQ_PROD_PAGE1); + assert!(evtq_prod > 0, "EVTQ must have at least one event"); + + // Read the event from guest memory. + let event: EvtEntry = gm.read_plain(EVTQ_GPA).expect("read fault event"); + assert_eq!( + event.event_id(), + EventId::F_TRANSLATION, + "Fault must be a translation fault" + ); + assert_eq!(event.sid, STREAM_ID, "Fault SID must match device"); + assert_eq!( + event.input_addr, unmapped_iova, + "Fault IOVA must match access" + ); + } +} diff --git a/vm/devices/iommu/smmu/src/lib.rs b/vm/devices/iommu/smmu/src/lib.rs new file mode 100644 index 0000000000..56ec3af54c --- /dev/null +++ b/vm/devices/iommu/smmu/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! SMMUv3 emulator for OpenVMM. +//! +//! This crate implements an Arm SMMUv3 (System Memory Management Unit) +//! emulator, providing IOVA→GPA translation for devices behind the SMMU. + +pub mod spec; + +mod emulator; +mod shared; +mod translate; + +pub use emulator::SmmuConfig; +pub use emulator::SmmuDevice; +pub use shared::SmmuSharedState; +pub use shared::SmmuSignalMsi; +pub use shared::SmmuTranslatingMemory; diff --git a/vm/devices/iommu/smmu/src/shared.rs b/vm/devices/iommu/smmu/src/shared.rs new file mode 100644 index 0000000000..42e8a0bf14 --- /dev/null +++ b/vm/devices/iommu/smmu/src/shared.rs @@ -0,0 +1,1371 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! 
Shared SMMU state and per-device translation wrappers. +//! +//! [`SmmuSharedState`] holds the SMMU configuration that per-device wrappers +//! need for translation: stream table base, CR0 state, and a reference to +//! guest memory for walking page tables. +//! +//! [`SmmuTranslatingMemory`] implements [`GuestMemoryAccess`], translating +//! IOVAs to GPAs via the SMMU page tables before accessing the underlying +//! guest memory. +//! +//! [`SmmuSignalMsi`] implements [`SignalMsi`], translating the MSI address +//! (which may be an IOVA) to a GPA before forwarding to the inner MSI +//! target. +//! +//! [`SmmuIrqFd`] implements [`IrqFd`](vmcore::irqfd::IrqFd), producing +//! [`SmmuIrqFdRoute`] instances that translate the MSI address on +//! [`enable`](vmcore::irqfd::IrqFdRoute::enable) before forwarding to the +//! inner irqfd route. + +use crate::spec::events::EvtEntry; +use crate::spec::registers; +use crate::translate; +use guestmem::GuestMemory; +use guestmem::GuestMemoryBackingError; +use pal_event::Event; +use parking_lot::Mutex; +use parking_lot::RwLock; +use pci_core::bus_range::AssignedBusRange; +use pci_core::msi::SignalMsi; +use std::fmt; +use std::ptr::NonNull; +use std::sync::Arc; +use vmcore::irqfd::IrqFd; +use vmcore::irqfd::IrqFdRoute; +use vmcore::line_interrupt::LineInterrupt; +use zerocopy::IntoBytes; + +/// Composes an SMMU-local stream ID from a bus range, a base offset, +/// and an optional per-device BDF. +/// +/// The stream ID is `stream_id_base + (bdf & 0xFFFF)`. When `devid` +/// is `None`, the default BDF `(secondary_bus, dev 0, fn 0)` is used. +/// +/// Returns `None` if the secondary bus has not been assigned yet +/// (still 0) or if the BDF's bus number falls outside the port's +/// assigned range. 
+fn compose_stream_id( + bus_range: &AssignedBusRange, + stream_id_base: u32, + devid: Option, +) -> Option { + let (secondary, subordinate) = bus_range.bus_range(); + if secondary == 0 { + return None; + } + let bdf = devid.unwrap_or((secondary as u32) << 8); + let bus = (bdf >> 8) as u8; + if bus < secondary || bus > subordinate { + tracelimit::warn_ratelimited!(bus, secondary, subordinate, "BDF out of port bus range"); + return None; + } + Some(stream_id_base + (bdf & 0xFFFF)) +} + +/// Translation error for SMMU DMA access. +#[derive(Debug)] +struct SmmuTranslationError { + iova: u64, + msg: &'static str, +} + +impl fmt::Display for SmmuTranslationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "SMMU translation failed: {} at IOVA {:#x}", + self.msg, self.iova + ) + } +} + +impl std::error::Error for SmmuTranslationError {} + +/// Result of an SMMU translation attempt. +#[derive(Debug)] +enum TranslateResult { + /// SMMU disabled or bus not yet assigned — bypass (IOVA = GPA). + Bypass, + /// Translated GPA. + Translated(u64), + /// Abort — STE says to abort this stream's DMA. + Abort(EvtEntry), + /// Translation fault — event to queue. + Fault(EvtEntry), +} + +/// Shared SMMU state accessed by per-device translation wrappers. +/// +/// The SMMU device updates this state on register writes; per-device wrappers +/// read it during translation. The `RwLock` allows concurrent translations +/// (read path) while register writes (write path) are exclusive. +/// +/// Queue and error state is behind a separate `Mutex` so that per-device +/// wrappers can write fault events and signal overflow without going through +/// the emulator. +pub struct SmmuSharedState { + /// Translation configuration — RwLock for concurrent DMA reads. + inner: RwLock, + /// Guest memory for reading page tables and stream table entries. 
+ guest_memory: GuestMemory, + /// Event queue and global error state — single mutex covers both + /// because the EVTQ overflow path needs to update GERROR atomically. + queue_state: Mutex, + /// Wired SPI interrupt line for event queue signaling. + evtq_irq: Option, + /// Wired SPI interrupt line for global error signaling. + gerror_irq: Option, +} + +struct SharedStateInner { + /// Whether the SMMU is enabled (CR0.SMMUEN). + enabled: bool, + /// Stream table base address. + strtab_base: u64, + /// Stream table log2 size (number of entries). + strtab_log2size: u8, +} + +/// Event queue and global error state. +/// +/// A single mutex serializes event writes from concurrent DMA fault +/// paths, GERROR updates from both the emulator and DMA overflow, +/// and interrupt line level changes. +struct QueueErrorState { + // -- Event queue -- + /// EVTQ base GPA (parsed from EVTQ_BASE register). + evtq_base_addr: u64, + /// EVTQ log2 size (clamped to IDR1.EVENTQS). + evtq_log2size: u8, + /// Whether the event queue is enabled (CR0.EVENTQEN). + evtq_enabled: bool, + /// Whether the EVTQ interrupt is enabled (IRQ_CTRL.EVENTQ_IRQEN). + evtq_irqen: bool, + /// Producer index (advanced by the SMMU when writing events). + evtq_prod: u32, + /// Consumer index (advanced by the guest via MMIO). + evtq_cons: u32, + + // -- Global error registers (toggle protocol) -- + /// GERROR register — individual error bits toggled by the SMMU. + gerror: registers::Gerror, + /// GERRORN register — written by the guest to acknowledge errors. + gerrorn: registers::Gerror, + /// Whether the GERROR interrupt is enabled (IRQ_CTRL.GERROR_IRQEN). + gerror_irqen: bool, +} + +impl SmmuSharedState { + /// Creates a new shared state with the SMMU disabled. 
+ pub fn new( + guest_memory: GuestMemory, + evtq_irq: Option, + gerror_irq: Option, + ) -> Arc { + Arc::new(Self { + inner: RwLock::new(SharedStateInner { + enabled: false, + strtab_base: 0, + strtab_log2size: 0, + }), + guest_memory, + queue_state: Mutex::new(QueueErrorState { + evtq_base_addr: 0, + evtq_log2size: 0, + evtq_enabled: false, + evtq_irqen: false, + evtq_prod: 0, + evtq_cons: 0, + gerror: registers::Gerror::new(), + gerrorn: registers::Gerror::new(), + gerror_irqen: false, + }), + evtq_irq, + gerror_irq, + }) + } + + /// Updates the SMMU enable state (called by SmmuDevice on CR0 writes). + pub fn set_enabled(&self, enabled: bool) { + self.inner.write().enabled = enabled; + } + + /// Updates the stream table configuration (called by SmmuDevice on + /// STRTAB_BASE / STRTAB_BASE_CFG writes). + pub fn set_strtab(&self, base: u64, log2size: u8) { + let mut inner = self.inner.write(); + inner.strtab_base = base; + inner.strtab_log2size = log2size; + } + + /// Updates the event queue configuration (called by SmmuDevice on + /// EVTQ_BASE writes). + pub fn set_evtq_config(&self, base_addr: u64, log2size: u8) { + let mut qs = self.queue_state.lock(); + qs.evtq_base_addr = base_addr; + qs.evtq_log2size = log2size; + } + + /// Updates the event queue enabled state (called on CR0 writes). + pub fn set_evtq_enabled(&self, enabled: bool) { + self.queue_state.lock().evtq_enabled = enabled; + } + + /// Updates both interrupt enable flags from IRQ_CTRL (called on + /// IRQ_CTRL writes). Also updates the GERROR interrupt line level. + pub fn set_irq_ctrl(&self, evtq_irqen: bool, gerror_irqen: bool) { + let mut qs = self.queue_state.lock(); + qs.evtq_irqen = evtq_irqen; + qs.gerror_irqen = gerror_irqen; + self.update_gerror_irq(&qs); + } + + /// Reads the current GERROR register value. + pub fn read_gerror(&self) -> registers::Gerror { + self.queue_state.lock().gerror + } + + /// Reads the current GERRORN register value. 
+ pub fn read_gerrorn(&self) -> registers::Gerror { + self.queue_state.lock().gerrorn + } + + /// Returns true if GERROR.CMDQ_ERR != GERRORN.CMDQ_ERR (error active). + pub fn cmdq_err_active(&self) -> bool { + let qs = self.queue_state.lock(); + qs.gerror.cmdq_err() != qs.gerrorn.cmdq_err() + } + + /// Writes GERRORN (guest acknowledging errors) and updates the + /// interrupt line level. + pub fn write_gerrorn(&self, value: u32) { + let mut qs = self.queue_state.lock(); + qs.gerrorn = registers::Gerror::from(value); + self.update_gerror_irq(&qs); + } + + /// Toggles GERROR.CMDQ_ERR to signal a command queue error. + /// + /// Updates the interrupt line level under the lock. + pub fn toggle_cmdq_err(&self) { + let mut qs = self.queue_state.lock(); + let new_val = !qs.gerror.cmdq_err(); + qs.gerror.set_cmdq_err(new_val); + self.update_gerror_irq(&qs); + } + + /// Signals an EVTQ overflow by making GERROR.EVTQ_ABT_ERR active. + /// + /// Per spec, sets the bit to the inverse of GERRORN.EVTQ_ABT_ERR. + /// If the error is already active this is a no-op (the bit value + /// doesn't change). Called from `write_event` under the same lock. + fn signal_evtq_overflow(&self, qs: &mut QueueErrorState) { + let new_val = !qs.gerrorn.eventq_abt_err(); + qs.gerror.set_eventq_abt_err(new_val); + self.update_gerror_irq(qs); + } + + /// Updates the GERROR wired interrupt line level based on current state. + /// + /// Must be called with the queue_state lock held. The line is held + /// high while any error is active (GERROR != GERRORN) and deasserted + /// when all errors are acknowledged. + fn update_gerror_irq(&self, qs: &QueueErrorState) { + if let Some(irq) = &self.gerror_irq { + let active = qs.gerror_irqen && u32::from(qs.gerror) != u32::from(qs.gerrorn); + irq.set_level(active); + } + } + + /// Updates the event queue consumer index (called when the guest + /// writes EVENTQ_CONS on page 1). + /// + /// Deasserts the EVTQ wired interrupt if the queue is now empty. 
+ pub fn set_evtq_cons(&self, cons: u32) { + let mut qs = self.queue_state.lock(); + qs.evtq_cons = cons; + // Deassert EVTQ IRQ when the guest has drained all events. + if qs.evtq_irqen && qs.evtq_prod == qs.evtq_cons { + if let Some(irq) = &self.evtq_irq { + irq.set_level(false); + } + } + } + + /// Returns the current event queue producer index (for guest reads + /// of EVENTQ_PROD on page 1). + pub fn evtq_prod(&self) -> u32 { + self.queue_state.lock().evtq_prod + } + + /// Returns the current event queue consumer index (for guest reads + /// of EVENTQ_CONS on page 1). + pub fn evtq_cons(&self) -> u32 { + self.queue_state.lock().evtq_cons + } + + /// Resets event queue and GERROR state (called on device reset). + pub fn reset_queue_state(&self) { + let mut qs = self.queue_state.lock(); + qs.evtq_base_addr = 0; + qs.evtq_log2size = 0; + qs.evtq_enabled = false; + qs.evtq_irqen = false; + qs.evtq_prod = 0; + qs.evtq_cons = 0; + qs.gerror = registers::Gerror::new(); + qs.gerrorn = registers::Gerror::new(); + qs.gerror_irqen = false; + self.update_gerror_irq(&qs); + } + + /// Translate an IOVA to a GPA for the given stream ID. + /// + /// Callers that need to hold the lock across translation and a subsequent + /// memory access should use [`translate_with`] instead. + fn translate(&self, sid: u32, iova: u64, write: bool) -> TranslateResult { + let inner = self.inner.read(); + self.translate_locked(&inner, sid, iova, write) + } + + /// Translate an IOVA to a GPA while holding the read lock. + /// + /// The caller holds `inner` across both translation and the subsequent + /// memory access, preventing SMMU config changes (disable, stream table + /// base update) from creating a TOCTOU between translation and access. + fn translate_locked( + &self, + inner: &SharedStateInner, + sid: u32, + iova: u64, + write: bool, + ) -> TranslateResult { + if !inner.enabled { + return TranslateResult::Bypass; + } + + // Look up the STE. 
+ let ste = match translate::lookup_ste( + &self.guest_memory, + inner.strtab_base, + inner.strtab_log2size, + sid, + ) { + Ok(ste) => ste, + Err(fault) => return TranslateResult::Fault(fault.event), + }; + + // Dispatch on STE config. + let action = match translate::ste_config_action(&ste) { + Ok(action) => action, + Err(_) => return TranslateResult::Fault(EvtEntry::bad_ste(sid)), + }; + + match action { + translate::SteAction::Abort => TranslateResult::Abort(EvtEntry::bad_ste(sid)), + translate::SteAction::Bypass => TranslateResult::Bypass, + translate::SteAction::S1Translate => { + // Look up the CD. + let cd = match translate::lookup_cd(&self.guest_memory, &ste, sid, 0) { + Ok(cd) => cd, + Err(fault) => return TranslateResult::Fault(fault.event), + }; + + // Extract translation context. + let ctx = match translate::translation_context(&cd, sid) { + Ok(ctx) => ctx, + Err(fault) => return TranslateResult::Fault(fault.event), + }; + + // Walk the page table. + match translate::walk_s1(&self.guest_memory, &ctx, iova, write, sid) { + Ok(tr) => TranslateResult::Translated(tr.gpa), + Err(fault) => TranslateResult::Fault(fault.event), + } + } + } + } + + /// Write an event record directly to the guest's event queue. + /// + /// Called from per-device DMA fault paths and from the emulator's + /// command processing. If the queue is full, drops the event and + /// logs a warning. If an event is successfully written, pulses + /// the EVTQ wired SPI interrupt (if enabled). + pub fn write_event(&self, event: EvtEntry) { + let mut qs = self.queue_state.lock(); + if !qs.evtq_enabled { + return; + } + + let max_entries = 1u32 << qs.evtq_log2size; + let index_mask = (max_entries << 1) - 1; + let prod = qs.evtq_prod & index_mask; + let cons = qs.evtq_cons & index_mask; + + // Check if the queue is full. Full when the index bits match but + // the wrap bit differs: (prod ^ cons) == max_entries. 
+ if (prod ^ cons) == max_entries { + // Signal EVTQ overflow via GERROR.EVTQ_ABT_ERR — updates + // the GERROR register and interrupt line under the same lock. + self.signal_evtq_overflow(&mut qs); + tracelimit::warn_ratelimited!("smmu: EVTQ full, dropping event"); + return; + } + + // Write the 32-byte event record to guest memory. + let index = prod & (max_entries - 1); + let entry_addr = qs.evtq_base_addr + (index as u64) * (EvtEntry::SIZE as u64); + + if let Err(e) = self.guest_memory.write_at(entry_addr, event.as_bytes()) { + tracelimit::warn_ratelimited!( + error = &e as &dyn std::error::Error, + entry_addr, + "smmu: failed to write EVTQ entry to guest memory" + ); + return; + } + + // Advance EVTQ_PROD. + qs.evtq_prod = (prod + 1) & index_mask; + + // Assert EVTQ wired interrupt — held high while queue is non-empty. + // Deasserted when the guest drains events via CONS writes. + if qs.evtq_irqen { + if let Some(irq) = &self.evtq_irq { + irq.set_level(true); + } + } + } + + /// Creates per-device wrappers for a PCI device behind this SMMU. + /// + /// `stream_id_base` is the offset into this SMMU's stream table for the + /// root complex this device belongs to (from the IORT `ID_MAPPING`). + /// The stream ID for translation is `stream_id_base + bdf`. + /// + /// Returns a `GuestMemory` that translates IOVAs via the SMMU, and a + /// `SignalMsi` implementation that translates MSI addresses. 
+ pub fn create_device_context( + self: &Arc, + bus_range: AssignedBusRange, + stream_id_base: u32, + inner_gm: &GuestMemory, + inner_msi: Arc, + ) -> (GuestMemory, Arc) { + let translating_mem = SmmuTranslatingMemory { + shared: self.clone(), + bus_range: bus_range.clone(), + stream_id_base, + inner_gm: inner_gm.clone(), + }; + + let gm = GuestMemory::new("smmu-translating", translating_mem); + + let signal_msi = Arc::new(SmmuSignalMsi { + shared: self.clone(), + stream_id_base, + inner: inner_msi, + }); + + (gm, signal_msi) + } + + /// Creates an SMMU irqfd wrapper for a PCI device behind this SMMU. + /// + /// `stream_id_base` is the offset into this SMMU's stream table for the + /// root complex this device belongs to. + /// + /// Irqfd routes created from the returned wrapper will translate MSI + /// addresses through the SMMU page tables before programming the + /// kernel route. + pub fn create_irqfd( + self: &Arc, + stream_id_base: u32, + inner: Arc, + ) -> Arc { + Arc::new(SmmuIrqFd { + shared: self.clone(), + stream_id_base, + inner, + }) + } +} + +/// A [`guestmem::GuestMemoryAccess`] implementation that translates IOVAs via the SMMU. +/// +/// Each PCI device behind the SMMU gets its own `SmmuTranslatingMemory`. +/// When the device reads or writes guest memory using an IOVA, this +/// wrapper translates the IOVA to a GPA using the SMMU page tables, then +/// delegates to the underlying guest memory. +pub struct SmmuTranslatingMemory { + shared: Arc, + bus_range: AssignedBusRange, + /// Offset into the SMMU's stream table for this root complex. + stream_id_base: u32, + inner_gm: GuestMemory, +} + +impl SmmuTranslatingMemory { + /// Perform a translated memory operation, handling page-crossing accesses. + /// + /// Holds the SMMU read lock across both translation and memory access + /// for each page chunk, preventing config changes between translation + /// and the actual DMA. 
Splits at page boundaries when the IOVA range + /// spans multiple pages (which may have different translations). + fn do_translated_op( + &self, + iova: u64, + len: usize, + write: bool, + mut op: impl FnMut(u64, usize, usize) -> Result<(), GuestMemoryBackingError>, + ) -> Result<(), GuestMemoryBackingError> { + let sid = match compose_stream_id(&self.bus_range, self.stream_id_base, None) { + Some(sid) => sid, + None => { + // Bus not assigned — bypass, no lock needed. + let mut offset = 0usize; + let mut remaining = len; + while remaining > 0 { + let current_iova = iova.wrapping_add(offset as u64); + let page_offset = (current_iova & 0xFFF) as usize; + let bytes_in_page = (0x1000 - page_offset).min(remaining); + op(current_iova, offset, bytes_in_page)?; + offset += bytes_in_page; + remaining -= bytes_in_page; + } + return Ok(()); + } + }; + + let mut offset = 0usize; + let mut remaining = len; + + while remaining > 0 { + let current_iova = iova.wrapping_add(offset as u64); + + // Compute how many bytes until the next page boundary. + let page_offset = (current_iova & 0xFFF) as usize; + let bytes_in_page = (0x1000 - page_offset).min(remaining); + + // Hold the read lock across translate + memory access to prevent + // SMMU config from changing between getting the GPA and using it. 
+ let inner = self.shared.inner.read(); + let gpa = match self + .shared + .translate_locked(&inner, sid, current_iova, write) + { + TranslateResult::Bypass => current_iova, + TranslateResult::Translated(gpa) => gpa, + TranslateResult::Abort(event) => { + drop(inner); + self.shared.write_event(event); + return Err(GuestMemoryBackingError::other( + current_iova, + SmmuTranslationError { + iova: current_iova, + msg: "DMA aborted by STE config", + }, + )); + } + TranslateResult::Fault(event) => { + drop(inner); + self.shared.write_event(event); + return Err(GuestMemoryBackingError::other( + current_iova, + SmmuTranslationError { + iova: current_iova, + msg: "translation fault", + }, + )); + } + }; + + op(gpa, offset, bytes_in_page)?; + drop(inner); + + offset += bytes_in_page; + remaining -= bytes_in_page; + } + + Ok(()) + } +} + +// UNSAFETY: SmmuTranslatingMemory returns `None` from `mapping()`, so the +// caller never gets a raw pointer. All accesses go through the fallback +// methods which translate IOVAs to GPAs and delegate to the inner +// GuestMemory. The inner GuestMemory is itself safe. +#[expect(unsafe_code)] +unsafe impl guestmem::GuestMemoryAccess for SmmuTranslatingMemory { + fn mapping(&self) -> Option> { + // Force all accesses through the fallback path for translation. + None + } + + fn max_address(&self) -> u64 { + // IOVAs can use the full address range; translation determines + // the actual valid range. + u64::MAX + } + + unsafe fn read_fallback( + &self, + addr: u64, + dest: *mut u8, + len: usize, + ) -> Result<(), GuestMemoryBackingError> { + self.do_translated_op(addr, len, false, |gpa, offset, chunk_len| { + // SAFETY: dest is valid for len bytes per the trait contract. + // We slice into dest at the correct offset. 
+ let chunk_dest = unsafe { std::slice::from_raw_parts_mut(dest.add(offset), chunk_len) }; + self.inner_gm + .read_at(gpa, chunk_dest) + .map_err(|e| GuestMemoryBackingError::other(addr, e)) + }) + } + + unsafe fn write_fallback( + &self, + addr: u64, + src: *const u8, + len: usize, + ) -> Result<(), GuestMemoryBackingError> { + self.do_translated_op(addr, len, true, |gpa, offset, chunk_len| { + // SAFETY: src is valid for len bytes per the trait contract. + let chunk_src = unsafe { std::slice::from_raw_parts(src.add(offset), chunk_len) }; + self.inner_gm + .write_at(gpa, chunk_src) + .map_err(|e| GuestMemoryBackingError::other(addr, e)) + }) + } + + fn fill_fallback(&self, addr: u64, val: u8, len: usize) -> Result<(), GuestMemoryBackingError> { + self.do_translated_op(addr, len, true, |gpa, _offset, chunk_len| { + self.inner_gm + .fill_at(gpa, val, chunk_len) + .map_err(|e| GuestMemoryBackingError::other(addr, e)) + }) + } +} + +/// A [`SignalMsi`] wrapper that translates MSI addresses through the SMMU. +/// +/// When a device behind the SMMU fires an MSI, the MSI address may be an +/// IOVA (Linux maps MSI doorbell pages into the device's IOVA space via +/// `iommu_dma_prepare_msi()`). This wrapper translates the address before +/// forwarding to the inner MSI target (typically an ITS or GICv2m wrapper). +pub struct SmmuSignalMsi { + shared: Arc, + /// Offset into the SMMU's stream table for this root complex. + stream_id_base: u32, + inner: Arc, +} + +impl SignalMsi for SmmuSignalMsi { + fn signal_msi(&self, devid: Option, address: u64, data: u32) { + // MsiTarget resolves devid to a BDF before calling us. 
+ let Some(bdf) = devid else { + return; + }; + let sid = self.stream_id_base + (bdf & 0xFFFF); + + match self.shared.translate(sid, address, true) { + TranslateResult::Bypass => { + self.inner.signal_msi(devid, address, data); + } + TranslateResult::Translated(gpa) => { + self.inner.signal_msi(devid, gpa, data); + } + TranslateResult::Abort(event) => { + self.shared.write_event(event); + tracelimit::warn_ratelimited!(sid, address, "smmu: MSI aborted by STE config"); + } + TranslateResult::Fault(event) => { + self.shared.write_event(event); + tracelimit::warn_ratelimited!(sid, address, "smmu: MSI translation fault"); + } + } + } +} + +/// An [`IrqFd`] wrapper that produces SMMU-translating irqfd routes. +/// +/// When a device behind the SMMU programs its MSI-X table, the MSI address +/// may be an IOVA. This wrapper creates [`SmmuIrqFdRoute`] instances that +/// translate the address through the SMMU before forwarding to the inner +/// irqfd route (which may itself be an ITS wrapper). +pub struct SmmuIrqFd { + shared: Arc, + /// Offset into the SMMU's stream table for this root complex. + stream_id_base: u32, + inner: Arc, +} + +impl IrqFd for SmmuIrqFd { + fn new_irqfd_route(&self) -> anyhow::Result> { + let inner_route = self.inner.new_irqfd_route()?; + Ok(Box::new(SmmuIrqFdRoute { + shared: self.shared.clone(), + stream_id_base: self.stream_id_base, + inner: inner_route, + })) + } +} + +/// An [`IrqFdRoute`] wrapper that translates the MSI address through the +/// SMMU on [`enable`](IrqFdRoute::enable). +/// +/// Translation happens at route-programming time (when the guest writes +/// the MSI-X table), not per-interrupt. If the guest changes SMMU page +/// tables after programming MSI-X, it must also re-program the MSI-X +/// entry (which is the normal flow — the IOMMU driver does this). +struct SmmuIrqFdRoute { + shared: Arc, + /// Offset into the SMMU's stream table for this root complex. 
+ stream_id_base: u32, + inner: Box, +} + +impl IrqFdRoute for SmmuIrqFdRoute { + fn event(&self) -> &Event { + self.inner.event() + } + + fn enable(&self, address: u64, data: u32, devid: Option) { + // MsiRoute resolves devid to a BDF before calling us. + let Some(bdf) = devid else { + return; + }; + let sid = self.stream_id_base + (bdf & 0xFFFF); + + match self.shared.translate(sid, address, true) { + TranslateResult::Bypass => { + self.inner.enable(address, data, devid); + } + TranslateResult::Translated(gpa) => { + self.inner.enable(gpa, data, devid); + } + TranslateResult::Abort(event) => { + self.shared.write_event(event); + tracelimit::warn_ratelimited!( + sid, + address, + "smmu: irqfd MSI route aborted by STE config" + ); + } + TranslateResult::Fault(event) => { + self.shared.write_event(event); + tracelimit::warn_ratelimited!( + sid, + address, + "smmu: irqfd MSI route translation fault" + ); + } + } + } + + fn disable(&self) { + self.inner.disable(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::spec::cd::CD_SIZE; + use crate::spec::cd::CdDw0; + use crate::spec::cd::CdDw1; + use crate::spec::cd::Ips; + use crate::spec::cd::Tg0; + use crate::spec::events::EventId; + use crate::spec::pt::ApBits; + use crate::spec::pt::PtDesc; + use crate::spec::ste::STE_SIZE; + use crate::spec::ste::Ste; + use crate::spec::ste::SteConfig; + use crate::spec::ste::SteDw0; + use crate::spec::ste::SteDw1; + use parking_lot::Mutex; + use std::sync::Arc; + + // Memory layout for tests. + const STRTAB_BASE: u64 = 0x10_0000; + const STRTAB_LOG2SIZE: u8 = 10; + const CD_BASE: u64 = 0x20_0000; + const PT_L1_BASE: u64 = 0x30_1000; + const PT_L2_BASE: u64 = 0x30_2000; + const PT_L3_BASE: u64 = 0x30_3000; + const DATA_GPA: u64 = 0x4000_0000; + /// EVTQ base GPA for tests (must not overlap other test regions). + const EVTQ_BASE: u64 = 0x4100_0000; + /// EVTQ log2 size for tests (3 = 8 entries). 
+ const EVTQ_LOG2SIZE: u8 = 3; + const TEST_SEGMENT: u16 = 0; + /// Stream ID base for the test root complex (matches IORT output_base). + const TEST_STREAM_ID_BASE: u32 = (TEST_SEGMENT as u32) << 16; + const TEST_BUS: u8 = 1; + /// The RID for the test device: (bus << 8) | devfn. + const TEST_RID: u32 = (TEST_BUS as u32) << 8; + + /// A mock SignalMsi that records calls. + struct MockSignalMsi { + calls: Mutex, u64, u32)>>, + } + + impl MockSignalMsi { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + + fn take_calls(&self) -> Vec<(Option, u64, u32)> { + std::mem::take(&mut *self.calls.lock()) + } + } + + impl SignalMsi for MockSignalMsi { + fn signal_msi(&self, devid: Option, address: u64, data: u32) { + self.calls.lock().push((devid, address, data)); + } + } + + fn make_bus_range() -> AssignedBusRange { + let br = AssignedBusRange::new(); + br.set_bus_range(TEST_BUS, TEST_BUS); + br + } + + fn expected_sid() -> u32 { + TEST_STREAM_ID_BASE + ((TEST_BUS as u32) << 8) + } + + fn write_ste(gm: &GuestMemory, sid: u32, ste: &Ste) { + let addr = STRTAB_BASE + (sid as u64) * (STE_SIZE as u64); + gm.write_plain(addr, ste).expect("write STE"); + } + + fn make_s1_ste(cd_base: u64) -> Ste { + use crate::spec::cd::CD_SIZE; + let _ = CD_SIZE; + Ste { + qw0: SteDw0::new() + .with_v(true) + .with_config(SteConfig::S1_TRANS.0) + .with_s1_context_ptr(cd_base >> 6) + .with_s1_cd_max(0), + qw1: SteDw1::new(), + _qw2_7: [0; 6], + } + } + + fn make_bypass_ste() -> Ste { + Ste { + qw0: SteDw0::new().with_v(true).with_config(SteConfig::BYPASS.0), + qw1: SteDw1::new(), + _qw2_7: [0; 6], + } + } + + fn make_abort_ste() -> Ste { + Ste { + qw0: SteDw0::new().with_v(true).with_config(SteConfig::ABORT.0), + qw1: SteDw1::new(), + _qw2_7: [0; 6], + } + } + + fn write_cd(gm: &GuestMemory, cd_base: u64, ssid: u32) { + use crate::spec::cd::Cd; + let cd = Cd { + qw0: CdDw0::new() + .with_v(true) + .with_t0sz(32) + .with_tg0(Tg0::GRAN_4K.0) + .with_ips(Ips::IPS_40.0) 
+ .with_aa64(true) + .with_asid(1), + qw1: CdDw1::new().with_ttb0(PT_L1_BASE >> 4), + _qw2: 0, + mair0: 0xFF440C0400, + mair1: 0, + _qw5_7: [0; 3], + }; + let addr = cd_base + (ssid as u64) * (CD_SIZE as u64); + gm.write_plain(addr, &cd).expect("write CD"); + } + + fn table_desc(next_table: u64) -> u64 { + PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_addr_bits(next_table >> 12) + .into() + } + + fn page_desc(output_addr: u64) -> u64 { + PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_af(true) + .with_ap(ApBits::RW_EL1.0) + .with_addr_bits(output_addr >> 12) + .into() + } + + fn write_pt_desc(gm: &GuestMemory, addr: u64, desc: u64) { + gm.write_plain(addr, &desc).expect("write PT desc"); + } + + /// Set up a complete SMMU translation context: + /// STE (S1_TRANS) → CD → page table mapping IOVA 0..4K → DATA_GPA. + fn setup_translation(gm: &GuestMemory, sid: u32) { + // Write STE. + write_ste(gm, sid, &make_s1_ste(CD_BASE)); + // Write CD. + write_cd(gm, CD_BASE, 0); + // Build 3-level page table (T0SZ=32, 4K granule: L1, L2, L3). + // L1[0] → L2 + write_pt_desc(gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → L3 + write_pt_desc(gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + // L3[0] → page at DATA_GPA + write_pt_desc(gm, PT_L3_BASE, page_desc(DATA_GPA)); + } + + fn make_shared_state(gm: &GuestMemory) -> Arc { + let state = SmmuSharedState::new(gm.clone(), None, None); + state.set_strtab(STRTAB_BASE, STRTAB_LOG2SIZE); + state.set_enabled(true); + // Enable EVTQ so fault events are written to guest memory. + state.set_evtq_config(EVTQ_BASE, EVTQ_LOG2SIZE); + state.set_evtq_enabled(true); + state + } + + /// Count events in the EVTQ by reading EVTQ_PROD from shared state. 
+ fn evtq_event_count(state: &SmmuSharedState) -> u32 { + state.evtq_prod() + } + + // ========================================================================= + // SmmuTranslatingMemory tests + // ========================================================================= + + #[test] + fn test_translating_memory_basic_read() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + setup_translation(&gm, sid); + + // Write test data at the physical GPA. + let data = b"hello SMMU"; + gm.write_at(DATA_GPA, data).unwrap(); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Read via IOVA 0 → should get data from DATA_GPA. + let mut buf = vec![0u8; data.len()]; + translating_gm.read_at(0, &mut buf).unwrap(); + assert_eq!(&buf, data); + } + + #[test] + fn test_translating_memory_basic_write() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + setup_translation(&gm, sid); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Write via IOVA. + let data = b"write test"; + translating_gm.write_at(0, data).unwrap(); + + // Verify data appears at the physical GPA. + let mut buf = vec![0u8; data.len()]; + gm.read_at(DATA_GPA, &mut buf).unwrap(); + assert_eq!(&buf, data); + } + + #[test] + fn test_translating_memory_with_offset() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + setup_translation(&gm, sid); + + // Write data at GPA + 0x100. 
+ let data = b"offset data"; + gm.write_at(DATA_GPA + 0x100, data).unwrap(); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Read via IOVA 0x100 → DATA_GPA + 0x100. + let mut buf = vec![0u8; data.len()]; + translating_gm.read_at(0x100, &mut buf).unwrap(); + assert_eq!(&buf, data); + } + + #[test] + fn test_translating_memory_cross_page() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + + // Set up STE and CD. + write_ste(&gm, sid, &make_s1_ste(CD_BASE)); + write_cd(&gm, CD_BASE, 0); + + // Map two adjacent pages: + // L3[0] → DATA_GPA (page at IOVA 0x0000) + // L3[1] → DATA_GPA + 0x2000 (page at IOVA 0x1000) + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA)); + write_pt_desc(&gm, PT_L3_BASE + 8, page_desc(DATA_GPA + 0x2000)); + + // Write data spanning the page boundary. + let data_page1 = vec![0xAAu8; 0x10]; + let data_page2 = vec![0xBBu8; 0x10]; + gm.write_at(DATA_GPA + 0xFF0, &data_page1).unwrap(); + gm.write_at(DATA_GPA + 0x2000, &data_page2).unwrap(); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Read 32 bytes starting at IOVA 0xFF0, crossing into page 2. + let mut buf = vec![0u8; 0x20]; + translating_gm.read_at(0xFF0, &mut buf).unwrap(); + assert_eq!(&buf[..0x10], &data_page1); + assert_eq!(&buf[0x10..], &data_page2); + } + + #[test] + fn test_translating_memory_bypass() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + + // STE in bypass mode. + write_ste(&gm, sid, &make_bypass_ste()); + + // Write data at GPA 0x1000. 
+ let data = b"bypass data"; + gm.write_at(0x1000, data).unwrap(); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Read via IOVA = GPA (identity mapping in bypass mode). + let mut buf = vec![0u8; data.len()]; + translating_gm.read_at(0x1000, &mut buf).unwrap(); + assert_eq!(&buf, data); + } + + #[test] + fn test_translating_memory_abort() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + + // STE in abort mode. + write_ste(&gm, sid, &make_abort_ste()); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Read should fail. + let mut buf = vec![0u8; 4]; + let result = translating_gm.read_at(0, &mut buf); + assert!(result.is_err()); + + // Should have written an event to the EVTQ. + assert_eq!(evtq_event_count(&state), 1); + } + + #[test] + fn test_translating_memory_unmapped() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + + // Set up STE and CD, but NO page table entries (L1 is all zeros). + write_ste(&gm, sid, &make_s1_ste(CD_BASE)); + write_cd(&gm, CD_BASE, 0); + // L1 is all zeros → translation fault. + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + let mut buf = vec![0u8; 4]; + let result = translating_gm.read_at(0, &mut buf); + assert!(result.is_err()); + + // Should have written a fault event to the EVTQ. + assert_eq!(evtq_event_count(&state), 1); + // Read the event from the EVTQ in guest memory. 
+ let written: EvtEntry = gm.read_plain(EVTQ_BASE).expect("read event"); + assert_eq!(written.event_id(), EventId::F_TRANSLATION); + } + + #[test] + fn test_translating_memory_unassigned_bus() { + let gm = GuestMemory::allocate(0x5000_0000); + + // Write data at GPA 0x2000. + let data = b"unassigned bus data"; + gm.write_at(0x2000, data).unwrap(); + + let state = make_shared_state(&gm); + // Bus range NOT assigned (secondary_bus = 0). + let bus_range = AssignedBusRange::new(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Should bypass translation (IOVA = GPA). + let mut buf = vec![0u8; data.len()]; + translating_gm.read_at(0x2000, &mut buf).unwrap(); + assert_eq!(&buf, data); + } + + #[test] + fn test_translating_memory_smmu_disabled() { + let gm = GuestMemory::allocate(0x5000_0000); + + // Write data at GPA 0x3000. + let data = b"disabled smmu"; + gm.write_at(0x3000, data).unwrap(); + + let state = SmmuSharedState::new(gm.clone(), None, None); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi); + + // Should bypass translation. + let mut buf = vec![0u8; data.len()]; + translating_gm.read_at(0x3000, &mut buf).unwrap(); + assert_eq!(&buf, data); + } + + // ========================================================================= + // SmmuSignalMsi tests + // ========================================================================= + + #[test] + fn test_signal_msi_translated() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + setup_translation(&gm, sid); + + // Also map a doorbell page: IOVA 0x800 → DATA_GPA + 0x1000. 
+ write_pt_desc(&gm, PT_L3_BASE + 8, page_desc(DATA_GPA + 0x1000)); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (_gm, smmu_msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi.clone()); + + // Fire MSI with IOVA address 0x1040 (page 1 + offset 0x40). + // devid is a RID — the SMMU combines it with segment to get the SID. + smmu_msi.signal_msi(Some(TEST_RID), 0x1040, 0xDEAD); + + let calls = mock_msi.take_calls(); + assert_eq!(calls.len(), 1); + // Translated address: DATA_GPA + 0x1000 + 0x40. + assert_eq!(calls[0], (Some(TEST_RID), DATA_GPA + 0x1040, 0xDEAD)); + } + + #[test] + fn test_signal_msi_bypass() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + + write_ste(&gm, sid, &make_bypass_ste()); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (_gm, smmu_msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi.clone()); + + // MsiTarget resolves devid to a BDF before calling SmmuSignalMsi. + smmu_msi.signal_msi(Some(TEST_RID), 0xFEE0_0000, 0x42); + + let calls = mock_msi.take_calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0], (Some(TEST_RID), 0xFEE0_0000, 0x42)); + } + + #[test] + fn test_signal_msi_unmapped() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + + // STE with S1 translation, but no page table entries. + write_ste(&gm, sid, &make_s1_ste(CD_BASE)); + write_cd(&gm, CD_BASE, 0); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (_gm, smmu_msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi.clone()); + + // Fire MSI with unmapped address. devid is a RID. + smmu_msi.signal_msi(Some(TEST_RID), 0x1000, 0x42); + + // MSI should NOT be forwarded. 
+ let calls = mock_msi.take_calls(); + assert!(calls.is_empty()); + + // Fault event should be written to the EVTQ. + assert_eq!(evtq_event_count(&state), 1); + } + + #[test] + fn test_signal_msi_devid_passthrough() { + let gm = GuestMemory::allocate(0x5000_0000); + let sid = expected_sid(); + + write_ste(&gm, sid, &make_bypass_ste()); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (_gm, smmu_msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi.clone()); + + // devid (RID) should be passed through unchanged to the inner MSI. + smmu_msi.signal_msi(Some(TEST_RID), 0x1000, 0x42); + + let calls = mock_msi.take_calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].0, Some(TEST_RID)); + } + + #[test] + fn test_signal_msi_no_devid() { + let gm = GuestMemory::allocate(0x5000_0000); + + let state = make_shared_state(&gm); + let bus_range = make_bus_range(); + let mock_msi = MockSignalMsi::new(); + + let (_gm, smmu_msi) = + state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi.clone()); + + // devid=None means no BDF — MSI should be dropped. + smmu_msi.signal_msi(None, 0xFEE0_0000, 0x42); + + let calls = mock_msi.take_calls(); + assert_eq!(calls.len(), 0); + } + + // ========================================================================= + // Stream ID remapping tests (non-zero stream_id_base) + // ========================================================================= + + #[test] + fn test_translating_memory_nonzero_stream_id_base() { + let gm = GuestMemory::allocate(0x5000_0000); + + // Use a non-zero stream_id_base (simulating a second root complex + // with its own region in the SMMU stream table). + // stream_id_base=256, bus=1 → SID = 256 + 256 = 512 (within 1024). + let stream_id_base: u32 = 256; + let bus: u8 = 1; + let sid = stream_id_base + ((bus as u32) << 8); + + // Set up translation for the remapped stream ID. 
+ write_ste(&gm, sid, &make_s1_ste(CD_BASE)); + write_cd(&gm, CD_BASE, 0); + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA)); + + let data = b"remapped sid test"; + gm.write_at(DATA_GPA, data).unwrap(); + + let state = make_shared_state(&gm); + let bus_range = AssignedBusRange::new(); + bus_range.set_bus_range(bus, bus); + let mock_msi = MockSignalMsi::new(); + + let (translating_gm, _msi) = + state.create_device_context(bus_range, stream_id_base, &gm, mock_msi); + + // Read via IOVA 0 → should find the STE at the remapped stream ID. + let mut buf = vec![0u8; data.len()]; + translating_gm.read_at(0, &mut buf).unwrap(); + assert_eq!(&buf, data); + } + + #[test] + fn test_signal_msi_nonzero_stream_id_base() { + let gm = GuestMemory::allocate(0x5000_0000); + + // Non-zero base (different root complex). + let stream_id_base: u32 = 256; + let bus: u8 = 1; + let sid = stream_id_base + ((bus as u32) << 8); + + // Set up bypass STE for the remapped stream ID. + write_ste(&gm, sid, &make_bypass_ste()); + + let state = make_shared_state(&gm); + let bus_range = AssignedBusRange::new(); + bus_range.set_bus_range(bus, bus); + let mock_msi = MockSignalMsi::new(); + + let (_gm, smmu_msi) = + state.create_device_context(bus_range, stream_id_base, &gm, mock_msi.clone()); + + // Fire MSI — bypass mode means address passes through unchanged. + let rid = (bus as u32) << 8; + smmu_msi.signal_msi(Some(rid), 0xFEE0_0000, 0x99); + + let calls = mock_msi.take_calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0], (Some(rid), 0xFEE0_0000, 0x99)); + } +} diff --git a/vm/devices/iommu/smmu/src/spec/cd.rs b/vm/devices/iommu/smmu/src/spec/cd.rs new file mode 100644 index 0000000000..9d4d827df3 --- /dev/null +++ b/vm/devices/iommu/smmu/src/spec/cd.rs @@ -0,0 +1,445 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! 
SMMUv3 Context Descriptor (CD) definitions. +//! +//! Each CD is 64 bytes (512 bits). The CD contains stage 1 translation table +//! pointers and ASID for a given stream/substream. + +use bitfield_struct::bitfield; +use open_enum::open_enum; +use zerocopy::FromBytes; +use zerocopy::Immutable; +use zerocopy::IntoBytes; +use zerocopy::KnownLayout; + +/// Context descriptor size in bytes. +pub const CD_SIZE: usize = 64; + +/// Context descriptor (64 bytes). +#[repr(C)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)] +pub struct Cd { + /// Quadword 0: T0SZ, TG0, cacheability, EPD0, V, IPS, ASID, etc. + pub qw0: CdDw0, + /// Quadword 1: TTB0. + pub qw1: CdDw1, + /// Quadword 2: TTB1 (unused for TTB0-only translation). + pub _qw2: u64, + /// MAIR0 (Memory Attribute Indirection Register 0). + pub mair0: u64, + /// MAIR1 (Memory Attribute Indirection Register 1). + pub mair1: u64, + /// Quadwords 5-7: AMAIR, PARTID, permission indirection, etc. + pub _qw5_7: [u64; 3], +} + +impl Cd { + /// Returns true if the CD is valid (V bit set). + pub fn valid(&self) -> bool { + self.qw0.v() + } + + /// Returns the TTB0 physical address. + /// + /// TTB0 is stored in QW1 as address bits `[55:4]`. + /// The actual address is the stored value shifted left by 4. + pub fn ttb0(&self) -> u64 { + self.qw1.ttb0() << 4 + } + + /// Returns T0SZ (VA region size for TTB0). + pub fn t0sz(&self) -> u8 { + self.qw0.t0sz() + } + + /// Returns TG0 (granule size for TTB0). + pub fn tg0(&self) -> Tg0 { + Tg0(self.qw0.tg0()) + } + + /// Returns IPS (intermediate physical address size). + pub fn ips(&self) -> Ips { + Ips(self.qw0.ips()) + } + + /// Returns the ASID. + pub fn asid(&self) -> u16 { + self.qw0.asid() + } + + /// Returns true if AA64 mode (VMSAv8-64) is selected. + pub fn aa64(&self) -> bool { + self.qw0.aa64() + } + + /// Returns true if TTB0 walks are disabled (EPD0=1). 
+ pub fn epd0(&self) -> bool { + self.qw0.epd0() + } +} + +/// CD QW0 (bits `[63:0]`): T0SZ, TG0, cacheability, EPD0, V, IPS, ASID, etc. +#[bitfield(u64)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] +pub struct CdDw0 { + /// VA region size for TTB0. VA range = 2^(64 - T0SZ). + #[bits(6)] + pub t0sz: u8, + /// TTB0 granule size. + #[bits(2)] + pub tg0: u8, + /// TTB0 inner cacheability. + #[bits(2)] + pub ir0: u8, + /// TTB0 outer cacheability. + #[bits(2)] + pub or0: u8, + /// TTB0 shareability. + #[bits(2)] + pub sh0: u8, + /// Disable TTB0 walk (1 = fault on miss). + pub epd0: bool, + /// Translation table endianness (0=LE, 1=BE). + pub endi: bool, + /// VA region size for TTB1. + #[bits(6)] + pub t1sz: u8, + /// TTB1 granule size. + #[bits(2)] + pub tg1: u8, + /// TTB1 inner cacheability. + #[bits(2)] + pub ir1: u8, + /// TTB1 outer cacheability. + #[bits(2)] + pub or1: u8, + /// TTB1 shareability. + #[bits(2)] + pub sh1: u8, + /// Disable TTB1 walk. + pub epd1: bool, + /// CD valid bit. + pub v: bool, + /// Intermediate physical address size. + #[bits(3)] + pub ips: u8, + /// Access flag fault disable. + pub affd: bool, + /// HW dirty bit management. + pub hd: bool, + /// HW access flag update. + pub ha: bool, + /// Stall (0=terminate, 1=stall on fault). + pub s: bool, + /// Non-shareable → OSH upgrade. + pub r: bool, + /// Abort flag. + pub a: bool, + /// ASID set (for TLB invalidation). + pub aset: bool, + /// Top byte ignore for TTB0 addresses. + pub tbi0: bool, + /// Top byte ignore for TTB1 addresses. + pub tbi1: bool, + /// Privileged Access Never. + pub pan: bool, + /// VMSAv8-64 mode (must be 1 for AArch64 page tables). + pub aa64: bool, + /// Write implies XN. + pub wxn: bool, + /// Unprivileged write implies XN. + pub uwxn: bool, + /// ASID (16-bit). + #[bits(16)] + pub asid: u16, +} + +/// CD QW1 (bits `[127:64]`): TTB0. 
+#[bitfield(u64)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] +pub struct CdDw1 { + /// Control bits (HAFT, E0PD0, NSCFG0, DisCH0). + #[bits(4)] + pub control: u8, + /// TTB0 address bits `[55:4]`. Actual address = stored << 4. + #[bits(52)] + pub ttb0: u64, + /// HW use fields (HWU0xx). + #[bits(4)] + pub hwu: u8, + /// SKL0 (start level override, if supported). + #[bits(2)] + pub skl0: u8, + #[bits(2)] + _reserved: u64, +} + +open_enum! { + /// TTB0 granule size (CD DW0 TG0 field). + pub enum Tg0: u8 { + /// 4KB granule. + GRAN_4K = 0b00, + /// 64KB granule. + GRAN_64K = 0b01, + /// 16KB granule. + GRAN_16K = 0b10, + } +} + +open_enum! { + /// Intermediate Physical Address Size (CD DW0 IPS field). + pub enum Ips: u8 { + /// 32-bit (4GB). + IPS_32 = 0b000, + /// 36-bit (64GB). + IPS_36 = 0b001, + /// 40-bit (1TB). + IPS_40 = 0b010, + /// 42-bit (4TB). + IPS_42 = 0b011, + /// 44-bit (16TB). + IPS_44 = 0b100, + /// 48-bit (256TB). + IPS_48 = 0b101, + /// 52-bit (4PB). + IPS_52 = 0b110, + } +} + +impl Ips { + /// Returns the number of physical address bits for this IPS value, + /// or `None` if the value is not a recognized encoding. + pub fn bits(self) -> Option { + Some(match self { + Self::IPS_32 => 32, + Self::IPS_36 => 36, + Self::IPS_40 => 40, + Self::IPS_42 => 42, + Self::IPS_44 => 44, + Self::IPS_48 => 48, + Self::IPS_52 => 52, + _ => return None, + }) + } +} + +impl Tg0 { + /// Returns the granule size in bytes, or `None` if the value is not + /// a recognized encoding. + pub fn granule_size(self) -> Option { + Some(match self { + Self::GRAN_4K => 4096, + Self::GRAN_16K => 16384, + Self::GRAN_64K => 65536, + _ => return None, + }) + } + + /// Returns the number of bits per page table level index, or `None` + /// if the value is not a recognized encoding. 
+ pub fn bits_per_level(self) -> Option { + Some(match self { + Self::GRAN_4K => 9, + Self::GRAN_16K => 11, + Self::GRAN_64K => 13, + _ => return None, + }) + } + + /// Returns the page offset bits (log2 of granule size), or `None` + /// if the value is not a recognized encoding. + pub fn page_shift(self) -> Option { + Some(match self { + Self::GRAN_4K => 12, + Self::GRAN_16K => 14, + Self::GRAN_64K => 16, + _ => return None, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cd_size() { + assert_eq!(size_of::(), CD_SIZE); + } + + #[test] + fn test_cd_valid() { + let cd = new_cd(); + assert!(!cd.valid()); + + let cd = Cd { + qw0: CdDw0::new().with_v(true), + ..new_cd() + }; + assert!(cd.valid()); + } + + #[test] + fn test_cd_dw0_fields() { + let dw0 = CdDw0::new() + .with_t0sz(16) + .with_tg0(Tg0::GRAN_4K.0) + .with_ir0(0b01) // WB + .with_or0(0b01) + .with_sh0(0b11) // ISH + .with_v(true) + .with_ips(Ips::IPS_40.0) + .with_aa64(true) + .with_asid(42); + + assert_eq!(dw0.t0sz(), 16); + assert_eq!(dw0.tg0(), Tg0::GRAN_4K.0); + assert_eq!(dw0.ir0(), 0b01); + assert_eq!(dw0.or0(), 0b01); + assert_eq!(dw0.sh0(), 0b11); + assert!(dw0.v()); + assert_eq!(dw0.ips(), Ips::IPS_40.0); + assert!(dw0.aa64()); + assert_eq!(dw0.asid(), 42); + assert!(!dw0.epd0()); + } + + fn new_cd() -> Cd { + Cd { + qw0: CdDw0::new(), + qw1: CdDw1::new(), + _qw2: 0, + mair0: 0, + mair1: 0, + _qw5_7: [0; 3], + } + } + + #[test] + fn test_cd_ttb0_address() { + let ttb0_addr: u64 = 0x4000_0000; + let cd = Cd { + qw1: CdDw1::new().with_ttb0(ttb0_addr >> 4), + ..new_cd() + }; + assert_eq!(cd.ttb0(), ttb0_addr); + } + + #[test] + fn test_cd_ttb0_large_address() { + let ttb0_addr: u64 = 0x00FF_FFFF_F000; + let cd = Cd { + qw1: CdDw1::new().with_ttb0(ttb0_addr >> 4), + ..new_cd() + }; + assert_eq!(cd.ttb0(), ttb0_addr); + } + + #[test] + fn test_cd_full_roundtrip() { + let ttb0_addr: u64 = 0x8000_0000; + let cd = Cd { + qw0: CdDw0::new() + .with_t0sz(32) + 
.with_tg0(Tg0::GRAN_4K.0) + .with_ir0(0b01) + .with_or0(0b01) + .with_sh0(0b11) + .with_v(true) + .with_ips(Ips::IPS_40.0) + .with_aa64(true) + .with_asid(100), + qw1: CdDw1::new().with_ttb0(ttb0_addr >> 4), + mair0: 0xFF44_0C04_00BB_FF00, + ..new_cd() + }; + + assert!(cd.valid()); + assert_eq!(cd.t0sz(), 32); + assert_eq!(cd.tg0(), Tg0::GRAN_4K); + assert_eq!(cd.ips(), Ips::IPS_40); + assert!(cd.aa64()); + assert_eq!(cd.asid(), 100); + assert_eq!(cd.ttb0(), ttb0_addr); + assert_eq!(cd.mair0, 0xFF44_0C04_00BB_FF00); + } + + #[test] + fn test_tg0_granule_sizes() { + assert_eq!(Tg0::GRAN_4K.granule_size(), Some(4096)); + assert_eq!(Tg0::GRAN_16K.granule_size(), Some(16384)); + assert_eq!(Tg0::GRAN_64K.granule_size(), Some(65536)); + assert_eq!(Tg0(0b11).granule_size(), None); + } + + #[test] + fn test_tg0_bits_per_level() { + assert_eq!(Tg0::GRAN_4K.bits_per_level(), Some(9)); + assert_eq!(Tg0::GRAN_16K.bits_per_level(), Some(11)); + assert_eq!(Tg0::GRAN_64K.bits_per_level(), Some(13)); + } + + #[test] + fn test_tg0_page_shift() { + assert_eq!(Tg0::GRAN_4K.page_shift(), Some(12)); + assert_eq!(Tg0::GRAN_16K.page_shift(), Some(14)); + assert_eq!(Tg0::GRAN_64K.page_shift(), Some(16)); + } + + #[test] + fn test_ips_bits() { + assert_eq!(Ips::IPS_32.bits(), Some(32)); + assert_eq!(Ips::IPS_36.bits(), Some(36)); + assert_eq!(Ips::IPS_40.bits(), Some(40)); + assert_eq!(Ips::IPS_42.bits(), Some(42)); + assert_eq!(Ips::IPS_44.bits(), Some(44)); + assert_eq!(Ips::IPS_48.bits(), Some(48)); + assert_eq!(Ips::IPS_52.bits(), Some(52)); + assert_eq!(Ips(0b111).bits(), None); + } + + #[test] + fn test_cd_invalid() { + let cd = new_cd(); + assert!(!cd.valid()); + } + + #[test] + fn test_cd_epd0_disables_walk() { + let cd = Cd { + qw0: CdDw0::new().with_v(true).with_epd0(true), + ..new_cd() + }; + + assert!(cd.valid()); + assert!(cd.epd0()); + } + + #[test] + fn test_translation_context_from_cd() { + let cd = Cd { + qw0: CdDw0::new() + .with_t0sz(16) // 48-bit VA + 
.with_tg0(Tg0::GRAN_4K.0) + .with_ips(Ips::IPS_48.0) + .with_v(true) + .with_aa64(true), + ..new_cd() + }; + + let tg0 = cd.tg0(); + let va_bits = 64 - cd.t0sz() as u32; + let page_shift = tg0.page_shift().unwrap() as u32; + let bits_per_level = tg0.bits_per_level().unwrap() as u32; + + assert_eq!(va_bits, 48); + assert_eq!(page_shift, 12); + assert_eq!(bits_per_level, 9); + + // For 4K/48-bit: start at level 0, 4 levels + let total_bits = va_bits - page_shift; + let num_levels = total_bits.div_ceil(bits_per_level); + assert_eq!(num_levels, 4); + } +} diff --git a/vm/devices/iommu/smmu/src/spec/commands.rs b/vm/devices/iommu/smmu/src/spec/commands.rs new file mode 100644 index 0000000000..871c600fc7 --- /dev/null +++ b/vm/devices/iommu/smmu/src/spec/commands.rs @@ -0,0 +1,299 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! SMMUv3 command queue entry definitions. +//! +//! Command queue entries are 16 bytes (128 bits). The opcode is in bits `[7:0]` +//! of the first dword. + +use bitfield_struct::bitfield; +use open_enum::open_enum; +use zerocopy::FromBytes; +use zerocopy::Immutable; +use zerocopy::IntoBytes; +use zerocopy::KnownLayout; + +open_enum! { + /// Command queue opcodes. + #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] + pub enum CmdOpcode: u8 { + /// Prefetch configuration. + PREFETCH_CFG = 0x01, + /// Invalidate cached STE. + CFGI_STE = 0x03, + /// Invalidate cached STE range (with Range=31 for ALL). + CFGI_STE_RANGE = 0x04, + /// Invalidate cached context descriptor. + CFGI_CD = 0x05, + /// Invalidate all cached CDs for a stream. + CFGI_CD_ALL = 0x06, + /// Invalidate all non-Hyp TLB entries. + TLBI_NH_ALL = 0x10, + /// Invalidate non-Hyp TLB entries by ASID. + TLBI_NH_ASID = 0x11, + /// Invalidate non-Hyp TLB entry by VA. + TLBI_NH_VA = 0x12, + /// Invalidate non-Hyp TLB entry by VA (all ASIDs). + TLBI_NH_VAA = 0x13, + /// Invalidate all stage 1+2 TLB entries for a VMID. 
+ TLBI_S12_VMALL = 0x28, + /// Invalidate all non-secure non-Hyp TLB entries. + TLBI_NSNH_ALL = 0x30, + /// Synchronization command. + CMD_SYNC = 0x46, + } +} + +/// Raw command queue entry (16 bytes = 2 quadwords). +/// +/// Commands are parsed by reading the opcode from the first byte of `qw0`, +/// then interpreting the remaining fields based on the command type. +#[repr(C)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)] +pub struct CmdEntry { + /// First quadword — contains the opcode and command-specific fields. + pub qw0: u64, + /// Second quadword — contains address or other extended fields. + pub qw1: u64, +} + +impl CmdEntry { + /// Returns the command opcode (bits `[7:0]` of qw0). + pub fn opcode(&self) -> CmdOpcode { + CmdOpcode((self.qw0 & 0xFF) as u8) + } +} + +/// CMD_CFGI_STE (opcode 0x03): Invalidate cached STE. +#[bitfield(u64)] +pub struct CmdCfgiSte { + /// Opcode (bits `[7:0]`). + #[bits(8)] + pub opcode: u8, + /// SSec (bit 8) — non-secure. + pub ssec: bool, + #[bits(23)] + _reserved0: u32, + /// StreamID (bits `[63:32]`). + #[bits(32)] + pub sid: u32, +} + +/// CMD_CFGI_STE_RANGE (opcode 0x04): Invalidate cached STE range. +/// +/// When Range=31, this is CMD_CFGI_ALL (invalidate all STEs). +#[bitfield(u64)] +pub struct CmdCfgiSteRange { + /// Opcode (bits `[7:0]`). + #[bits(8)] + pub opcode: u8, + /// SSec (bit 8) — non-secure. + pub ssec: bool, + #[bits(23)] + _reserved0: u32, + /// StreamID (bits `[63:32]`). + #[bits(32)] + pub sid: u32, +} + +impl CmdCfgiSteRange { + /// The range field is in bits `[68:64]` of the full 128-bit entry (low bits of qw1). + pub fn range_from_entry(entry: &CmdEntry) -> u8 { + (entry.qw1 & 0x1F) as u8 + } + + /// Range=31 means invalidate ALL STEs. + pub const RANGE_ALL: u8 = 31; +} + +/// CMD_CFGI_CD (opcode 0x05): Invalidate cached context descriptor. +#[bitfield(u64)] +pub struct CmdCfgiCd { + /// Opcode (bits `[7:0]`). 
+ #[bits(8)] + pub opcode: u8, + /// SSec (bit 8) — non-secure. + pub ssec: bool, + #[bits(3)] + _reserved0: u32, + /// SubstreamID (bits `[31:12]`). + #[bits(20)] + pub ssid: u32, + /// StreamID (bits `[63:32]`). + #[bits(32)] + pub sid: u32, +} + +/// CMD_TLBI_NH_ASID (opcode 0x11): Invalidate TLB by ASID. +#[bitfield(u64)] +pub struct CmdTlbiNhAsid { + /// Opcode (bits `[7:0]`). + #[bits(8)] + pub opcode: u8, + #[bits(24)] + _reserved0: u32, + /// VMID (bits `[47:32]`). + #[bits(16)] + pub vmid: u16, + /// ASID (bits `[63:48]`). + #[bits(16)] + pub asid: u16, +} + +/// CMD_TLBI_NH_VA (opcode 0x12): Invalidate TLB by virtual address. +#[bitfield(u64)] +pub struct CmdTlbiNhVa { + /// Opcode (bits `[7:0]`). + #[bits(8)] + pub opcode: u8, + #[bits(24)] + _reserved0: u32, + /// VMID (bits `[47:32]`). + #[bits(16)] + pub vmid: u16, + /// ASID (bits `[63:48]`). + #[bits(16)] + pub asid: u16, +} + +impl CmdTlbiNhVa { + /// The address field is in bits `[127:68]` of the full 128-bit entry. + /// This extracts the VA from the raw entry (address bits `[63:12]`). + pub fn addr_from_entry(entry: &CmdEntry) -> u64 { + let shifted = entry.qw1 >> 4; // bits [127:68] → bits [59:0] + (shifted & ((1u64 << 52) - 1)) << 12 + } + + /// Leaf bit is at bit 64 of the 128-bit entry (bit 0 of qw1). + pub fn leaf_from_entry(entry: &CmdEntry) -> bool { + entry.qw1 & 1 != 0 + } +} + +/// CMD_SYNC (opcode 0x46): Synchronization command. +#[bitfield(u64)] +pub struct CmdSync { + /// Opcode (bits `[7:0]`). + #[bits(8)] + pub opcode: u8, + #[bits(4)] + _reserved0: u32, + /// Completion signal type (bits `[13:12]`). + #[bits(2)] + pub cs: u8, + #[bits(8)] + _reserved1: u32, + /// MSI shareability (bits `[23:22]`). + #[bits(2)] + pub msh: u8, + /// MSI attributes (bits `[27:24]`). + #[bits(4)] + pub msi_attr: u8, + #[bits(4)] + _reserved2: u32, + /// MSI data (bits `[63:32]`). + #[bits(32)] + pub msi_data: u32, +} + +impl CmdSync { + /// Extract the MSI address from the full 128-bit command entry. 
+ /// MSI address is in bits `[119:66]` → address `[55:2]`. + pub fn msi_addr_from_entry(entry: &CmdEntry) -> u64 { + let shifted = entry.qw1 >> 2; // bits [119:66] → bits [53:0] + shifted & ((1u64 << 54) - 1) + } + + /// Returns the full MSI address (with bits `[1:0]` = 0). + pub fn msi_write_addr_from_entry(entry: &CmdEntry) -> u64 { + Self::msi_addr_from_entry(entry) << 2 + } +} + +open_enum! { + /// CMD_SYNC completion signal types. + pub enum SyncCs: u8 { + /// No signal. + SIG_NONE = 0b00, + /// Send MSI/IRQ. + SIG_IRQ = 0b01, + /// Send SEV wakeup event. + SIG_SEV = 0b10, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cmd_opcodes() { + assert_eq!(CmdOpcode::PREFETCH_CFG.0, 0x01); + assert_eq!(CmdOpcode::CFGI_STE.0, 0x03); + assert_eq!(CmdOpcode::CFGI_STE_RANGE.0, 0x04); + assert_eq!(CmdOpcode::CFGI_CD.0, 0x05); + assert_eq!(CmdOpcode::CFGI_CD_ALL.0, 0x06); + assert_eq!(CmdOpcode::TLBI_NH_ALL.0, 0x10); + assert_eq!(CmdOpcode::TLBI_NH_ASID.0, 0x11); + assert_eq!(CmdOpcode::TLBI_NH_VA.0, 0x12); + assert_eq!(CmdOpcode::TLBI_NH_VAA.0, 0x13); + assert_eq!(CmdOpcode::TLBI_NSNH_ALL.0, 0x30); + assert_eq!(CmdOpcode::CMD_SYNC.0, 0x46); + } + + #[test] + fn test_cmd_entry_opcode() { + let entry = CmdEntry { qw0: 0x46, qw1: 0 }; + assert_eq!(entry.opcode(), CmdOpcode::CMD_SYNC); + } + + #[test] + fn test_cmd_cfgi_ste_sid() { + let cmd = CmdCfgiSte::new() + .with_opcode(CmdOpcode::CFGI_STE.0) + .with_sid(42); + assert_eq!(cmd.opcode(), CmdOpcode::CFGI_STE.0); + assert_eq!(cmd.sid(), 42); + } + + #[test] + fn test_cmd_sync_fields() { + let cmd = CmdSync::new() + .with_opcode(CmdOpcode::CMD_SYNC.0) + .with_cs(SyncCs::SIG_IRQ.0) + .with_msi_data(0xDEAD_BEEF); + assert_eq!(cmd.opcode(), CmdOpcode::CMD_SYNC.0); + assert_eq!(cmd.cs(), SyncCs::SIG_IRQ.0); + assert_eq!(cmd.msi_data(), 0xDEAD_BEEF); + } + + #[test] + fn test_cmd_sync_msi_addr() { + // MSI address = 0x1234_5678 + // Stored in qw1 bits [55:2] as (addr >> 2) << 2 + let addr: u64 = 
0x1234_5678; + let addr_shifted = addr >> 2; + let entry = CmdEntry { + qw0: CmdOpcode::CMD_SYNC.0 as u64, + qw1: addr_shifted << 2, + }; + assert_eq!(CmdSync::msi_write_addr_from_entry(&entry), addr & !0x3); + } + + #[test] + fn test_cfgi_ste_range_all() { + let entry = CmdEntry { + qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64, + qw1: 31, + }; + assert_eq!( + CmdCfgiSteRange::range_from_entry(&entry), + CmdCfgiSteRange::RANGE_ALL + ); + } + + #[test] + fn test_cmd_entry_size() { + assert_eq!(size_of::(), 16); + } +} diff --git a/vm/devices/iommu/smmu/src/spec/events.rs b/vm/devices/iommu/smmu/src/spec/events.rs new file mode 100644 index 0000000000..3c85f01167 --- /dev/null +++ b/vm/devices/iommu/smmu/src/spec/events.rs @@ -0,0 +1,265 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! SMMUv3 event queue entry definitions. +//! +//! Event queue entries are 32 bytes (256 bits). The event type is in bits `[7:0]` +//! of the first dword. + +use bitfield_struct::bitfield; +use open_enum::open_enum; +use zerocopy::FromBytes; +use zerocopy::Immutable; +use zerocopy::IntoBytes; +use zerocopy::KnownLayout; + +open_enum! { + /// Event queue record types. + #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] + pub enum EventId: u8 { + /// Unsupported upstream transaction. + F_UUT = 0x01, + /// StreamID out of range. + C_BAD_STREAMID = 0x02, + /// STE fetch external abort. + F_STE_FETCH = 0x03, + /// Bad STE configuration. + C_BAD_STE = 0x04, + /// Bad ATS translation request. + F_BAD_ATS_TREQ = 0x05, + /// Stream disabled. + F_STREAM_DISABLED = 0x06, + /// ATS translated traffic forbidden. + F_TRANSL_FORBIDDEN = 0x07, + /// Bad SubstreamID. + C_BAD_SUBSTREAMID = 0x08, + /// CD fetch external abort. + F_CD_FETCH = 0x09, + /// Bad CD configuration. + C_BAD_CD = 0x0A, + /// Translation table walk external abort. + F_WALK_EABT = 0x0B, + /// Translation fault. + F_TRANSLATION = 0x10, + /// Address size fault. 
+        F_ADDR_SIZE = 0x11,
+        /// Access flag fault.
+        F_ACCESS = 0x12,
+        /// Permission fault.
+        F_PERMISSION = 0x13,
+        /// TLB conflict.
+        F_TLB_CONFLICT = 0x20,
+    }
+}
+
+/// Event queue entry (32 bytes).
+///
+/// Field order follows the architectural record layout: word 0 is the
+/// header, word 1 the StreamID, word 2 carries STAG, and word 3 carries the
+/// fault flags.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct EvtEntry {
+    /// Event type and SubstreamID info (record bits `[31:0]`).
+    pub header: EvtHeader,
+    /// StreamID of the faulting device (record bits `[63:32]`).
+    pub sid: u32,
+    /// Reserved / STAG (record bits `[95:64]`).
+    pub _stag: u32,
+    /// Fault flags (RnW, S2, CLASS, etc.; record bits `[127:96]`).
+    pub flags: EvtFlags,
+    /// Faulting input address (64-bit).
+    pub input_addr: u64,
+    /// Fetch address or reserved (64-bit).
+    pub _fetch_addr: u64,
+}
+
+/// Event entry header (first 32 bits).
+#[bitfield(u32)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct EvtHeader {
+    /// Event type (bits `[7:0]`).
+    #[bits(8)]
+    pub event_id: u8,
+    #[bits(3)]
+    _reserved0: u32,
+    /// SubstreamID valid (bit 11).
+    pub ssv: bool,
+    /// SubstreamID (bits `[31:12]`).
+    #[bits(20)]
+    pub ssid: u32,
+}
+
+/// Event entry flags (fourth 32-bit word, record bits `[127:96]`).
+#[bitfield(u32)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct EvtFlags {
+    #[bits(1)]
+    _reserved0: u32,
+    /// Privileged/Unprivileged (record bit 97).
+    pub pnu: bool,
+    /// Instruction/Data (record bit 98).
+    pub ind: bool,
+    /// Read (true) / Write (false) (record bit 99).
+    pub rnw: bool,
+    #[bits(3)]
+    _reserved1: u32,
+    /// Stage 2 fault (false = S1 fault) (record bit 103).
+    pub s2: bool,
+    /// Fault class (record bits `[105:104]`).
+    #[bits(2)]
+    pub class: u8,
+    #[bits(22)]
+    _reserved2: u32,
+}
+
+impl EvtEntry {
+    /// Size of an event queue entry in bytes.
+    pub const SIZE: usize = 32;
+
+    /// Creates a new zeroed event entry.
+    pub fn new() -> Self {
+        Self {
+            header: EvtHeader::new(),
+            sid: 0,
+            _stag: 0,
+            flags: EvtFlags::new(),
+            input_addr: 0,
+            _fetch_addr: 0,
+        }
+    }
+
+    /// Returns the event type.
+    pub fn event_id(&self) -> EventId {
+        EventId(self.header.event_id())
+    }
+
+    /// Creates a translation fault event.
+ pub fn translation_fault(sid: u32, iova: u64, write: bool) -> Self { + Self { + header: EvtHeader::new().with_event_id(EventId::F_TRANSLATION.0), + sid, + flags: EvtFlags::new().with_rnw(!write), + input_addr: iova, + ..Self::new() + } + } + + /// Creates a permission fault event. + pub fn permission_fault(sid: u32, iova: u64, write: bool) -> Self { + Self { + header: EvtHeader::new().with_event_id(EventId::F_PERMISSION.0), + sid, + flags: EvtFlags::new().with_rnw(!write), + input_addr: iova, + ..Self::new() + } + } + + /// Creates an access flag fault event. + pub fn access_fault(sid: u32, iova: u64, write: bool) -> Self { + Self { + header: EvtHeader::new().with_event_id(EventId::F_ACCESS.0), + sid, + flags: EvtFlags::new().with_rnw(!write), + input_addr: iova, + ..Self::new() + } + } + + /// Creates an address size fault event. + pub fn addr_size_fault(sid: u32, iova: u64, write: bool) -> Self { + Self { + header: EvtHeader::new().with_event_id(EventId::F_ADDR_SIZE.0), + sid, + flags: EvtFlags::new().with_rnw(!write), + input_addr: iova, + ..Self::new() + } + } + + /// Creates a bad STE event. + pub fn bad_ste(sid: u32) -> Self { + Self { + header: EvtHeader::new().with_event_id(EventId::C_BAD_STE.0), + sid, + ..Self::new() + } + } + + /// Creates a bad CD event. 
+ pub fn bad_cd(sid: u32) -> Self { + Self { + header: EvtHeader::new().with_event_id(EventId::C_BAD_CD.0), + sid, + ..Self::new() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_event_ids() { + assert_eq!(EventId::F_UUT.0, 0x01); + assert_eq!(EventId::C_BAD_STREAMID.0, 0x02); + assert_eq!(EventId::C_BAD_STE.0, 0x04); + assert_eq!(EventId::C_BAD_CD.0, 0x0A); + assert_eq!(EventId::F_TRANSLATION.0, 0x10); + assert_eq!(EventId::F_ADDR_SIZE.0, 0x11); + assert_eq!(EventId::F_ACCESS.0, 0x12); + assert_eq!(EventId::F_PERMISSION.0, 0x13); + } + + #[test] + fn test_evt_entry_size() { + assert_eq!(size_of::(), 32); + } + + #[test] + fn test_evt_entry_translation_fault() { + let evt = EvtEntry::translation_fault(0x42, 0x1000_2000, true); + assert_eq!(evt.event_id(), EventId::F_TRANSLATION); + assert_eq!(evt.sid, 0x42); + assert_eq!(evt.input_addr, 0x1000_2000); + // write → RnW = false (not-read) + assert!(!evt.flags.rnw()); + } + + #[test] + fn test_evt_entry_permission_fault() { + let evt = EvtEntry::permission_fault(0x10, 0xFFFF_0000, false); + assert_eq!(evt.event_id(), EventId::F_PERMISSION); + assert_eq!(evt.sid, 0x10); + assert_eq!(evt.input_addr, 0xFFFF_0000); + // read → RnW = true + assert!(evt.flags.rnw()); + } + + #[test] + fn test_evt_entry_bad_ste() { + let evt = EvtEntry::bad_ste(0x100); + assert_eq!(evt.event_id(), EventId::C_BAD_STE); + assert_eq!(evt.sid, 0x100); + } + + #[test] + fn test_evt_entry_access_fault() { + let evt = EvtEntry::access_fault(5, 0xDEAD_BEEF_0000, true); + assert_eq!(evt.event_id(), EventId::F_ACCESS); + assert_eq!(evt.sid, 5); + assert_eq!(evt.input_addr, 0xDEAD_BEEF_0000); + } + + #[test] + fn test_evt_entry_roundtrip() { + let evt = EvtEntry { + header: EvtHeader::new().with_event_id(EventId::F_ADDR_SIZE.0), + sid: 0xABCD, + flags: EvtFlags::new().with_rnw(true), + input_addr: 0x1234_5678_9ABC_DEF0, + ..EvtEntry::new() + }; + + assert_eq!(evt.event_id(), EventId::F_ADDR_SIZE); + assert_eq!(evt.sid, 
0xABCD); + assert_eq!(evt.input_addr, 0x1234_5678_9ABC_DEF0); + assert!(evt.flags.rnw()); + } +} diff --git a/vm/devices/iommu/smmu/src/spec/mod.rs b/vm/devices/iommu/smmu/src/spec/mod.rs new file mode 100644 index 0000000000..5cadef514f --- /dev/null +++ b/vm/devices/iommu/smmu/src/spec/mod.rs @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! SMMUv3 spec-derived type definitions. +//! +//! Register layouts, stream table entries, context descriptors, command/event +//! queue entries, and page table descriptors — all derived from the Arm SMMUv3 +//! architecture specification (IHI 0070). +//! +//! This module contains only type definitions, not algorithms. + +pub mod cd; +pub mod commands; +pub mod events; +pub mod pt; +pub mod registers; +pub mod ste; diff --git a/vm/devices/iommu/smmu/src/spec/pt.rs b/vm/devices/iommu/smmu/src/spec/pt.rs new file mode 100644 index 0000000000..947bfc60ab --- /dev/null +++ b/vm/devices/iommu/smmu/src/spec/pt.rs @@ -0,0 +1,396 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! AArch64 VMSAv8 stage 1 page table descriptor definitions. +//! +//! The SMMU uses the same page table format as AArch64 PE stage 1 translation. +//! These are the standard ARMv8 translation table descriptors defined in the +//! Arm Architecture Reference Manual (DDI 0487). + +use bitfield_struct::bitfield; +use open_enum::open_enum; +use zerocopy::FromBytes; +use zerocopy::Immutable; +use zerocopy::IntoBytes; +use zerocopy::KnownLayout; + +/// A 64-bit page table descriptor. 
+/// +/// The interpretation depends on the level and the Type bit: +/// - Level 0-2, Type=1: Table descriptor (points to next-level table) +/// - Level 1-2, Type=0: Block descriptor (maps a large region) +/// - Level 3, Type=1: Page descriptor (maps a single page) +/// - Level 3, Type=0: Reserved (invalid) +/// - Valid=0: Invalid/fault entry +#[bitfield(u64)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] +pub struct PtDesc { + /// Valid bit. 0 = fault entry. + pub valid: bool, + /// Descriptor type. 1 = table/page, 0 = block (or reserved at L3). + pub desc_type: bool, + /// Memory attribute index (indexes into MAIR). + #[bits(3)] + pub attr_index: u8, + /// Non-secure bit. + pub ns: bool, + /// Access permissions. + #[bits(2)] + pub ap: u8, + /// Shareability. + #[bits(2)] + pub sh: u8, + /// Access flag. Must be 1 to avoid AF faults (when HTTU not supported). + pub af: bool, + /// Not-global (if 1, uses ASID for TLB matching). + pub ng: bool, + /// Output address / next-level table address bits `[47:12]`. + /// For 4KB granule: block at L1 uses `[47:30]`, block at L2 uses `[47:21]`, + /// page at L3 uses `[47:12]`. + #[bits(36)] + pub addr_bits: u64, + /// Reserved / upper attributes bits `[49:48]`. + #[bits(2)] + _reserved_upper: u64, + /// Guarded page. + pub gp: bool, + /// Dirty bit modifier. + pub dbm: bool, + /// Contiguous hint. + pub contiguous: bool, + /// Privileged execute-never. + pub pxn: bool, + /// Unprivileged execute-never (or XN for EL2/EL3). + pub uxn: bool, + /// Software use / PBHA. + #[bits(4)] + pub sw_use: u8, + /// Ignored / PBHA. + #[bits(5)] + pub ignored_upper: u8, +} + +impl PtDesc { + /// Returns true if this is a valid entry. + pub fn is_valid(&self) -> bool { + self.valid() + } + + /// Returns true if this is a table descriptor (levels 0-2) or page + /// descriptor (level 3). Type bit = 1. 
+ pub fn is_table(&self) -> bool { + self.valid() && self.desc_type() + } + + /// Returns true if this is a block descriptor (levels 1-2). + /// Valid=1 and Type=0. + pub fn is_block(&self) -> bool { + self.valid() && !self.desc_type() + } + + /// Returns true if this is a page descriptor at level 3. + /// At L3, Valid=1 and Type=1 means page. Type=0 is reserved/fault. + pub fn is_page_at_l3(&self) -> bool { + self.valid() && self.desc_type() + } + + /// Returns the output address for a 4KB granule. + /// + /// For table descriptors: the next-level table address (bits `[47:12]`). + /// For block descriptors at L1: bits `[47:30]` (1GB block). + /// For block descriptors at L2: bits `[47:21]` (2MB block). + /// For page descriptors at L3: bits `[47:12]` (4KB page). + pub fn output_address_4k(&self, level: u8) -> u64 { + let raw = self.addr_bits() << 12; + match level { + 0 => raw, // table only at L0 for 4K + 1 => { + if self.is_block() { + raw & !((1u64 << 30) - 1) // 1GB aligned + } else { + raw // table address + } + } + 2 => { + if self.is_block() { + raw & !((1u64 << 21) - 1) // 2MB aligned + } else { + raw // table address + } + } + 3 => raw, // page address, 4KB aligned + _ => raw, + } + } + + /// Returns the output address for a 16KB granule. + pub fn output_address_16k(&self, level: u8) -> u64 { + let raw = self.addr_bits() << 12; + match level { + // L1 block: 64GB (bits [47:36]) + 1 => { + if self.is_block() { + raw & !((1u64 << 36) - 1) + } else { + raw + } + } + // L2 block: 32MB (bits [47:25]) + 2 => { + if self.is_block() { + raw & !((1u64 << 25) - 1) + } else { + raw + } + } + 3 => raw, // page address, 16KB aligned + _ => raw, + } + } + + /// Returns the output address for a 64KB granule. 
+ pub fn output_address_64k(&self, level: u8) -> u64 { + let raw = self.addr_bits() << 12; + match level { + // L2 block: 512MB (bits [47:29]) + 2 => { + if self.is_block() { + raw & !((1u64 << 29) - 1) + } else { + raw + } + } + 3 => raw, // page address, 64KB aligned + _ => raw, + } + } + + /// Returns the next-level table address (for table descriptors). + /// The table address is always in bits `[47:12]`, page-aligned. + pub fn next_table_addr(&self) -> u64 { + self.addr_bits() << 12 + } +} + +open_enum! { + /// Access permission bits (AP`[2:1]`). + pub enum ApBits: u8 { + /// EL1 R/W, EL0 no access. + RW_EL1 = 0b00, + /// EL1 R/W, EL0 R/W. + RW_ANY = 0b01, + /// EL1 R/O, EL0 no access. + RO_EL1 = 0b10, + /// EL1 R/O, EL0 R/O. + RO_ANY = 0b11, + } +} + +impl ApBits { + /// Returns true if the access permissions allow writes. + pub fn allows_write(self) -> bool { + match self { + Self::RW_EL1 | Self::RW_ANY => true, + Self::RO_EL1 | Self::RO_ANY => false, + _ => false, + } + } + + /// Returns true if the access permissions allow reads (always true for + /// valid permissions). + pub fn allows_read(self) -> bool { + true + } +} + +open_enum! { + /// Shareability field values. + pub enum Shareability: u8 { + /// Non-shareable. + NON_SHAREABLE = 0b00, + /// Outer shareable. + OUTER_SHAREABLE = 0b10, + /// Inner shareable. 
+ INNER_SHAREABLE = 0b11, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pt_desc_invalid() { + let desc = PtDesc::from(0u64); + assert!(!desc.is_valid()); + assert!(!desc.is_table()); + assert!(!desc.is_block()); + } + + #[test] + fn test_pt_desc_table() { + // Valid=1, Type=1 → table descriptor + let desc = PtDesc::new().with_valid(true).with_desc_type(true); + assert!(desc.is_valid()); + assert!(desc.is_table()); + assert!(!desc.is_block()); + } + + #[test] + fn test_pt_desc_block() { + // Valid=1, Type=0 → block descriptor + let desc = PtDesc::new().with_valid(true).with_desc_type(false); + assert!(desc.is_valid()); + assert!(!desc.is_table()); + assert!(desc.is_block()); + } + + #[test] + fn test_pt_desc_page_at_l3() { + // At L3: Valid=1, Type=1 → page descriptor + let desc = PtDesc::new().with_valid(true).with_desc_type(true); + assert!(desc.is_page_at_l3()); + } + + #[test] + fn test_pt_desc_4k_page_address() { + // 4K page at L3: output address at bits [47:12] + let page_addr: u64 = 0x4000_1000; + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_addr_bits(page_addr >> 12); + + assert_eq!(desc.output_address_4k(3), page_addr); + } + + #[test] + fn test_pt_desc_4k_l2_block_address() { + // 2MB block at L2: output address at bits [47:21] + let block_addr: u64 = 0x4020_0000; // 2MB aligned + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(false) + .with_addr_bits(block_addr >> 12); + + assert_eq!(desc.output_address_4k(2), block_addr); + } + + #[test] + fn test_pt_desc_4k_l1_block_address() { + // 1GB block at L1: output address at bits [47:30] + let block_addr: u64 = 0x4000_0000; // 1GB aligned + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(false) + .with_addr_bits(block_addr >> 12); + + assert_eq!(desc.output_address_4k(1), block_addr); + } + + #[test] + fn test_pt_desc_table_next_addr() { + let table_addr: u64 = 0x8000_5000; + let desc = PtDesc::new() + .with_valid(true) 
+ .with_desc_type(true) + .with_addr_bits(table_addr >> 12); + + assert_eq!(desc.next_table_addr(), table_addr); + } + + #[test] + fn test_pt_desc_access_flag() { + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_af(true); + assert!(desc.af()); + + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_af(false); + assert!(!desc.af()); + } + + #[test] + fn test_pt_desc_permissions() { + // RW_EL1 + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_ap(ApBits::RW_EL1.0); + assert_eq!(desc.ap(), ApBits::RW_EL1.0); + + // RO_EL1 + let desc = desc.with_ap(ApBits::RO_EL1.0); + assert_eq!(desc.ap(), ApBits::RO_EL1.0); + } + + #[test] + fn test_ap_bits_write_permission() { + assert!(ApBits::RW_EL1.allows_write()); + assert!(ApBits::RW_ANY.allows_write()); + assert!(!ApBits::RO_EL1.allows_write()); + assert!(!ApBits::RO_ANY.allows_write()); + } + + #[test] + fn test_ap_bits_read_permission() { + // All valid AP values allow reads + assert!(ApBits::RW_EL1.allows_read()); + assert!(ApBits::RW_ANY.allows_read()); + assert!(ApBits::RO_EL1.allows_read()); + assert!(ApBits::RO_ANY.allows_read()); + } + + #[test] + fn test_pt_desc_full_roundtrip() { + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_attr_index(3) + .with_ns(true) + .with_ap(ApBits::RO_ANY.0) + .with_sh(Shareability::INNER_SHAREABLE.0) + .with_af(true) + .with_ng(true) + .with_addr_bits(0x1234_5000_u64 >> 12) + .with_pxn(true) + .with_uxn(true); + + assert!(desc.valid()); + assert!(desc.desc_type()); + assert_eq!(desc.attr_index(), 3); + assert!(desc.ns()); + assert_eq!(desc.ap(), ApBits::RO_ANY.0); + assert_eq!(desc.sh(), Shareability::INNER_SHAREABLE.0); + assert!(desc.af()); + assert!(desc.ng()); + assert_eq!(desc.next_table_addr(), 0x1234_5000); + assert!(desc.pxn()); + assert!(desc.uxn()); + } + + #[test] + fn test_pt_desc_preserves_page_offset() { + // Verify that the output address does not include 
sub-page bits + let page_addr: u64 = 0x8000_3000; + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_addr_bits(page_addr >> 12); + + // At L3, the output is the page base + assert_eq!(desc.output_address_4k(3), page_addr); + assert_eq!(desc.output_address_4k(3) & 0xFFF, 0); + } + + #[test] + fn test_shareability_values() { + assert_eq!(Shareability::NON_SHAREABLE.0, 0b00); + assert_eq!(Shareability::OUTER_SHAREABLE.0, 0b10); + assert_eq!(Shareability::INNER_SHAREABLE.0, 0b11); + } +} diff --git a/vm/devices/iommu/smmu/src/spec/registers.rs b/vm/devices/iommu/smmu/src/spec/registers.rs new file mode 100644 index 0000000000..42b2f90e1e --- /dev/null +++ b/vm/devices/iommu/smmu/src/spec/registers.rs @@ -0,0 +1,708 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! SMMUv3 MMIO register definitions. +//! +//! Register offsets and bitfield types from the Arm SMMUv3 architecture +//! specification (IHI 0070), Chapter 6. + +use bitfield_struct::bitfield; +use inspect::Inspect; +use open_enum::open_enum; + +// ============================================================================= +// MMIO Register Offsets — Page 0 (base + 0x00000) +// ============================================================================= + +/// SMMU_IDR0: Feature identification register. +pub const IDR0: u16 = 0x0000; +/// SMMU_IDR1: Queue and stream size identification. +pub const IDR1: u16 = 0x0004; +/// SMMU_IDR2: Extended feature identification. +pub const IDR2: u16 = 0x0008; +/// SMMU_IDR3: Extended feature identification. +pub const IDR3: u16 = 0x000C; +/// SMMU_IDR4: Implementation-defined identification. +pub const IDR4: u16 = 0x0010; +/// SMMU_IDR5: Granule and output address size. +pub const IDR5: u16 = 0x0014; +/// SMMU_IIDR: Implementer identification. +pub const IIDR: u16 = 0x0018; +/// SMMU_AIDR: Architecture version identification. +pub const AIDR: u16 = 0x001C; + +/// SMMU_CR0: Control register. 
+pub const CR0: u16 = 0x0020; +/// SMMU_CR0ACK: CR0 acknowledgment (read-only). +pub const CR0ACK: u16 = 0x0024; +/// SMMU_CR1: Queue/table access attributes. +pub const CR1: u16 = 0x0028; +/// SMMU_CR2: Extended controls. +pub const CR2: u16 = 0x002C; + +/// SMMU_STATUSR: Status register. +pub const STATUSR: u16 = 0x0040; +/// SMMU_GBPA: Global bypass attributes. +pub const GBPA: u16 = 0x0044; +/// SMMU_AGBPA: Alternate global bypass attributes. +pub const AGBPA: u16 = 0x0048; + +/// SMMU_IRQ_CTRL: Interrupt enable register. +pub const IRQ_CTRL: u16 = 0x0050; +/// SMMU_IRQ_CTRLACK: IRQ_CTRL acknowledgment (read-only). +pub const IRQ_CTRLACK: u16 = 0x0054; + +/// SMMU_GERROR: Global error status (read-only, toggle protocol). +pub const GERROR: u16 = 0x0060; +/// SMMU_GERRORN: Global error acknowledgment. +pub const GERRORN: u16 = 0x0064; + +/// SMMU_GERROR_IRQ_CFG0: GERROR MSI address (64-bit). +pub const GERROR_IRQ_CFG0: u16 = 0x0068; +/// SMMU_GERROR_IRQ_CFG1: GERROR MSI data payload. +pub const GERROR_IRQ_CFG1: u16 = 0x0070; +/// SMMU_GERROR_IRQ_CFG2: GERROR MSI attributes. +pub const GERROR_IRQ_CFG2: u16 = 0x0074; + +/// SMMU_STRTAB_BASE: Stream table base address (64-bit). +pub const STRTAB_BASE: u16 = 0x0080; +/// SMMU_STRTAB_BASE_CFG: Stream table configuration. +pub const STRTAB_BASE_CFG: u16 = 0x0088; + +/// SMMU_CMDQ_BASE: Command queue base address (64-bit). +pub const CMDQ_BASE: u16 = 0x0090; +/// SMMU_CMDQ_PROD: Command queue producer index. +pub const CMDQ_PROD: u16 = 0x0098; +/// SMMU_CMDQ_CONS: Command queue consumer index. +pub const CMDQ_CONS: u16 = 0x009C; + +/// SMMU_EVENTQ_BASE: Event queue base address (64-bit). +pub const EVENTQ_BASE: u16 = 0x00A0; + +/// SMMU_EVENTQ_IRQ_CFG0: Event queue MSI address (64-bit). +pub const EVENTQ_IRQ_CFG0: u16 = 0x00B0; +/// SMMU_EVENTQ_IRQ_CFG1: Event queue MSI data. +pub const EVENTQ_IRQ_CFG1: u16 = 0x00B8; +/// SMMU_EVENTQ_IRQ_CFG2: Event queue MSI attributes. 
+pub const EVENTQ_IRQ_CFG2: u16 = 0x00BC; + +// ============================================================================= +// MMIO Register Offsets — Page 1 (base + 0x10000) +// ============================================================================= + +/// SMMU_EVENTQ_PROD: Event queue producer index (page 1). +pub const EVENTQ_PROD_PAGE1: u32 = 0x100A8; +/// SMMU_EVENTQ_CONS: Event queue consumer index (page 1). +pub const EVENTQ_CONS_PAGE1: u32 = 0x100AC; + +/// SMMU_CMDQ_IRQ_CFG0: Command queue MSI address (page 1, 64-bit). +pub const CMDQ_IRQ_CFG0_PAGE1: u32 = 0x10008; +/// SMMU_CMDQ_IRQ_CFG1: Command queue MSI data (page 1). +pub const CMDQ_IRQ_CFG1_PAGE1: u32 = 0x10010; +/// SMMU_CMDQ_IRQ_CFG2: Command queue MSI attributes (page 1). +pub const CMDQ_IRQ_CFG2_PAGE1: u32 = 0x10014; + +/// Total MMIO region size: page 0 (64KB) + page 1 (64KB) = 128KB. +pub const MMIO_REGION_SIZE: u64 = 0x20000; + +// ============================================================================= +// Bitfield Types — Identification Registers +// ============================================================================= + +/// SMMU_IDR0: Feature identification. +#[bitfield(u32)] +#[derive(PartialEq, Eq, Inspect)] +pub struct Idr0 { + /// Stage 2 translation supported. + pub s2p: bool, + /// Stage 1 translation supported. + pub s1p: bool, + /// Translation table format. + #[bits(2)] + pub ttf: u8, + /// Coherent access supported. + pub cohacc: bool, + /// Broadcast TLB maintenance. + pub btm: bool, + /// Hardware translation table update. + #[bits(2)] + pub httu: u8, + /// Dormant hint. + pub dormhint: bool, + /// Hypervisor stage. + pub hyp: bool, + /// ATS supported. + pub ats: bool, + /// NS1ATS. + pub ns1ats: bool, + /// 16-bit ASID supported. + pub asid16: bool, + /// MSI supported. + pub msi: bool, + /// SEV supported. + pub sev: bool, + /// ATOS supported. + pub atos: bool, + /// PRI supported. + pub pri: bool, + /// VMID wildcard. 
+ pub vmw: bool, + /// 16-bit VMID supported. + pub vmid16: bool, + /// 2-level CD table supported. + pub cd2l: bool, + /// Virtual ATOS. + pub vatos: bool, + /// Translation table endianness. + #[bits(2)] + pub ttendian: u8, + /// ATS recording error. + pub atsrecerr: bool, + /// Stall model. + #[bits(2)] + pub stall_model: u8, + /// Terminate model. + pub term_model: bool, + /// Stream table level. + #[bits(2)] + pub st_level: u8, + #[bits(1)] + _reserved: u32, + /// RME implementation. + pub rme_impl: bool, + #[bits(1)] + _reserved2: u32, +} + +/// SMMU_IDR1: Queue and stream size identification. +#[bitfield(u32)] +#[derive(PartialEq, Eq, Inspect)] +pub struct Idr1 { + /// StreamID size (number of bits). + #[bits(6)] + pub sidsize: u8, + /// SubstreamID size (number of bits). + #[bits(5)] + pub ssidsize: u8, + /// Reserved. + #[bits(5)] + _reserved0: u32, + /// Max event queue size as log2(entries). + #[bits(5)] + pub eventqs: u8, + /// Max command queue size as log2(entries). + #[bits(5)] + pub cmdqs: u8, + /// Attribute permissions override. + pub attr_perms_ovr: bool, + /// Attribute types override. + pub attr_types_ovr: bool, + /// REL (relative base pointers). + pub rel: bool, + /// Queues preset. + pub queues_preset: bool, + /// Tables preset. + pub tables_preset: bool, + /// Enhanced CMDQ. + pub ecmdq: bool, +} + +/// SMMU_IDR5: Granule and output address size. +#[bitfield(u32)] +#[derive(PartialEq, Eq, Inspect)] +pub struct Idr5 { + /// Output address size. + #[bits(3)] + pub oas: u8, + #[bits(1)] + _reserved0: u32, + /// 4KB granule supported. + pub gran4k: bool, + /// 16KB granule supported. + pub gran16k: bool, + /// 64KB granule supported. + pub gran64k: bool, + /// Double-size support. + pub ds: bool, + /// 128-bit descriptors. + pub d128: bool, + #[bits(1)] + _reserved1: u32, + /// VA extension (48 or 52 bit). + #[bits(2)] + pub vax: u8, + #[bits(4)] + _reserved2: u32, + /// Max stall entries. 
+    #[bits(16)]
+    pub stall_max: u16,
+}
+
+// =============================================================================
+// Bitfield Types — Control Registers
+// =============================================================================
+
+/// SMMU_CR0: Control register.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Cr0 {
+    /// SMMU enable.
+    pub smmuen: bool,
+    /// PRI queue enable.
+    pub priqen: bool,
+    /// Event queue enable.
+    pub eventqen: bool,
+    /// Command queue enable.
+    pub cmdqen: bool,
+    /// ATS check enable.
+    pub atschk: bool,
+    #[bits(1)]
+    _reserved0: u32,
+    /// VMW override.
+    #[bits(3)]
+    pub vmw: u8,
+    #[bits(1)]
+    _reserved1: u32,
+    /// DPT walk enable.
+    pub dpt_walk_en: bool,
+    /// VSID enable.
+    pub vsiden: bool,
+    #[bits(20)]
+    _reserved2: u32,
+}
+
+/// SMMU_CR1: Queue/table access attributes.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Cr1 {
+    /// Queue inner cacheability.
+    #[bits(2)]
+    pub queue_ic: u8,
+    /// Queue outer cacheability.
+    #[bits(2)]
+    pub queue_oc: u8,
+    /// Queue shareability.
+    #[bits(2)]
+    pub queue_sh: u8,
+    /// Table inner cacheability.
+    #[bits(2)]
+    pub table_ic: u8,
+    /// Table outer cacheability.
+    #[bits(2)]
+    pub table_oc: u8,
+    /// Table shareability.
+    #[bits(2)]
+    pub table_sh: u8,
+    #[bits(20)]
+    _reserved: u32,
+}
+
+/// SMMU_CR2: Extended controls.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Cr2 {
+    /// E2H enable (bit 0).
+    pub e2h: bool,
+    /// Record invalid StreamIDs (RECINVSID, bit 1).
+    pub recinvsid: bool,
+    /// PTM enable (bit 2).
+    pub ptm: bool,
+    #[bits(29)]
+    _reserved: u32,
+}
+
+/// SMMU_GBPA: Global bypass attributes.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Gbpa {
+    // Bits `[7:0]` (MemAttr, MTCFG and reserved bits) are not modeled as
+    // separate fields here.
+    #[bits(8)]
+    _reserved0: u32,
+    /// Allocation hint override (ALLOCCFG, bits `[11:8]`).
+    #[bits(4)]
+    pub alloccfg: u8,
+    /// Shareability configuration (SHCFG, bits `[13:12]`).
+    #[bits(2)]
+    pub shcfg: u8,
+    #[bits(2)]
+    _reserved1: u32,
+    /// Privilege override (PRIVCFG, bits `[17:16]`).
+    #[bits(2)]
+    pub privcfg: u8,
+    /// Instruction/data type override (INSTCFG, bits `[19:18]`).
+    #[bits(2)]
+    pub instcfg: u8,
+    /// Abort all incoming transactions (ABORT, bit 20).
+    pub abort: bool,
+    #[bits(10)]
+    _reserved2: u32,
+    /// Update in progress (bit 31; cleared by SMMU on completion).
+    pub update: bool,
+}
+
+/// SMMU_IRQ_CTRL: Interrupt enable control.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct IrqCtrl {
+    /// Global error IRQ enable.
+    pub gerror_irqen: bool,
+    /// PRI queue IRQ enable.
+    pub priq_irqen: bool,
+    /// Event queue IRQ enable.
+    pub eventq_irqen: bool,
+    #[bits(29)]
+    _reserved: u32,
+}
+
+/// SMMU_GERROR / SMMU_GERRORN: Global error status bits.
+///
+/// An error is active when `GERROR[bit] != GERRORN[bit]`. The SMMU toggles
+/// GERROR to signal; software toggles GERRORN to acknowledge.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Gerror {
+    /// Command queue error.
+    pub cmdq_err: bool,
+    #[bits(1)]
+    _reserved0: u32,
+    /// Event queue access aborted.
+    pub eventq_abt_err: bool,
+    /// PRI queue access aborted.
+    pub priq_abt_err: bool,
+    /// CMD_SYNC MSI aborted.
+    pub msi_cmdq_abt_err: bool,
+    /// EVTQ MSI aborted.
+    pub msi_eventq_abt_err: bool,
+    /// PRIQ MSI aborted.
+    pub msi_priq_abt_err: bool,
+    /// GERROR MSI aborted.
+    pub msi_gerror_abt_err: bool,
+    /// Service failure mode.
+    pub sfm_err: bool,
+    #[bits(23)]
+    _reserved1: u32,
+}
+
+// =============================================================================
+// Bitfield Types — Queue Base Registers
+// =============================================================================
+
+/// SMMU_STRTAB_BASE: Stream table base address.
+#[bitfield(u64)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct StrtabBase {
+    #[bits(6)]
+    _reserved0: u64,
+    /// Physical address of the stream table, bits `[55:6]`.
+    #[bits(50)]
+    pub addr_bits: u64,
+    #[bits(6)]
+    _reserved1: u64,
+    /// Read-allocate hint.
+ pub ra: bool, + #[bits(1)] + _reserved2: u64, +} + +impl StrtabBase { + /// Returns the physical address of the stream table. + pub fn addr(&self) -> u64 { + self.addr_bits() << 6 + } +} + +/// SMMU_STRTAB_BASE_CFG: Stream table configuration. +#[bitfield(u32)] +#[derive(PartialEq, Eq, Inspect)] +pub struct StrtabBaseCfg { + /// Table size as log2(entries). + #[bits(6)] + pub log2size: u8, + /// Split point for 2-level tables (ignored for linear). + #[bits(5)] + pub split: u8, + #[bits(5)] + _reserved: u32, + /// Stream table format: 0=linear, 1=2-level. + #[bits(2)] + pub fmt: u8, + #[bits(14)] + _reserved2: u32, +} + +open_enum! { + /// Stream table format values for `StrtabBaseCfg.fmt`. + pub enum StrtabFmt: u8 { + /// Linear stream table. + LINEAR = 0, + /// 2-level stream table. + TWO_LEVEL = 1, + } +} + +/// SMMU_CMDQ_BASE / SMMU_EVENTQ_BASE: Queue base address. +#[bitfield(u64)] +#[derive(PartialEq, Eq, Inspect)] +pub struct QueueBase { + /// Queue size as log2(entries). + #[bits(5)] + pub log2size: u8, + /// Physical address of queue memory, bits `[55:5]`. + #[bits(51)] + pub addr_bits: u64, + #[bits(6)] + _reserved: u64, + /// Read/write allocate hint. + pub ra_wa: bool, + #[bits(1)] + _reserved2: u64, +} + +impl QueueBase { + /// Returns the physical address of the queue. + pub fn addr(&self) -> u64 { + self.addr_bits() << 5 + } +} + +/// SMMU_CMDQ_CONS: Command queue consumer index. +/// +/// Has an error field in the upper bits that indicates the reason for a +/// command queue error. +#[bitfield(u32)] +#[derive(PartialEq, Eq, Inspect)] +pub struct CmdqCons { + /// Read index with wrap bit (bits `[19:0]`). + #[bits(20)] + pub rd: u32, + #[bits(4)] + _reserved: u32, + /// Error code (valid when GERROR.CMDQ_ERR is active). + #[bits(7)] + pub err: u8, + #[bits(1)] + _reserved2: u32, +} + +open_enum! { + /// Command queue error codes for `CmdqCons.err`. + pub enum CmdqError: u8 { + /// No error. + CERROR_NONE = 0, + /// Illegal command. 
+ CERROR_ILL = 1, + /// Command queue abort. + CERROR_ABT = 2, + /// ATS error. + CERROR_ATS_ERR = 3, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_idr0_bitfield_roundtrip() { + let idr0 = Idr0::new() + .with_s1p(true) + .with_ttf(0b10) + .with_cohacc(true) + .with_asid16(true) + .with_msi(true) + .with_ttendian(0b10) + .with_stall_model(0b01) + .with_term_model(true); + assert!(idr0.s1p()); + assert_eq!(idr0.ttf(), 0b10); + assert!(idr0.cohacc()); + assert!(idr0.asid16()); + assert!(idr0.msi()); + assert_eq!(idr0.ttendian(), 0b10); + assert!(!idr0.s2p()); + assert!(!idr0.ats()); + assert!(!idr0.pri()); + } + + #[test] + fn test_idr0_recommended_value() { + // The recommended IDR0 value from the spec reference: + // S1P=1, TTF=0b10, COHACC=1, ASID16=1, MSI=1, + // TTENDIAN=0b10 (LE), STALL_MODEL=0b01, TERM_MODEL=1 + // = 0x0540_301A (bits 1, 3, 4, 12, 13, 22, 24 and 26 of the `Idr0` layout) + let idr0 = Idr0::new() + .with_s1p(true) + .with_ttf(0b10) + .with_cohacc(true) + .with_asid16(true) + .with_msi(true) + .with_ttendian(0b10) + .with_stall_model(0b01) + .with_term_model(true); + + // Verify individual bits + assert!(idr0.s1p()); + assert!(idr0.msi()); + assert!(idr0.cohacc()); + assert!(idr0.term_model()); + } + + #[test] + fn test_idr1_bitfield_roundtrip() { + let idr1 = Idr1::new().with_sidsize(8).with_cmdqs(8).with_eventqs(8); + assert_eq!(idr1.sidsize(), 8); + assert_eq!(idr1.cmdqs(), 8); + assert_eq!(idr1.eventqs(), 8); + assert_eq!(idr1.ssidsize(), 0); + assert!(!idr1.tables_preset()); + assert!(!idr1.queues_preset()); + } + + #[test] + fn test_idr5_bitfield_roundtrip() { + let idr5 = Idr5::new() + .with_oas(0b010) + .with_gran4k(true) + .with_gran64k(true); + assert_eq!(idr5.oas(), 0b010); + assert!(idr5.gran4k()); + assert!(idr5.gran64k()); + assert!(!idr5.gran16k()); + } + + #[test] + fn test_cr0_bitfield_roundtrip() { + let cr0 = Cr0::new() + .with_smmuen(true) + .with_cmdqen(true) + .with_eventqen(true); + assert!(cr0.smmuen()); + assert!(cr0.cmdqen()); +
assert!(cr0.eventqen()); + assert!(!cr0.priqen()); + } + + #[test] + fn test_cr0_enable_sequence() { + // Linux enables features one at a time: + // 1. CMDQEN + let cr0 = Cr0::new().with_cmdqen(true); + assert!(cr0.cmdqen()); + assert!(!cr0.eventqen()); + assert!(!cr0.smmuen()); + + // 2. CMDQEN + EVENTQEN + let cr0 = cr0.with_eventqen(true); + assert!(cr0.cmdqen()); + assert!(cr0.eventqen()); + assert!(!cr0.smmuen()); + + // 3. CMDQEN + EVENTQEN + SMMUEN + let cr0 = cr0.with_smmuen(true); + assert!(cr0.cmdqen()); + assert!(cr0.eventqen()); + assert!(cr0.smmuen()); + } + + #[test] + fn test_gbpa_update_bit() { + let gbpa = Gbpa::new().with_update(true).with_abort(true); + assert!(gbpa.update()); + assert!(gbpa.abort()); + + // Simulate SMMU clearing the update bit + let gbpa = gbpa.with_update(false); + assert!(!gbpa.update()); + assert!(gbpa.abort()); + } + + #[test] + fn test_irq_ctrl_roundtrip() { + let irq_ctrl = IrqCtrl::new() + .with_gerror_irqen(true) + .with_eventq_irqen(true); + assert!(irq_ctrl.gerror_irqen()); + assert!(irq_ctrl.eventq_irqen()); + assert!(!irq_ctrl.priq_irqen()); + } + + #[test] + fn test_gerror_toggle_protocol() { + let gerror = Gerror::new().with_cmdq_err(true); + let gerrorn = Gerror::new(); + + // Error is active when bits differ + assert_ne!(gerror.cmdq_err(), gerrorn.cmdq_err(),); + + // Software acknowledges by matching + let gerrorn = gerrorn.with_cmdq_err(true); + assert_eq!(gerror.cmdq_err(), gerrorn.cmdq_err(),); + } + + #[test] + fn test_strtab_base_address() { + // Address must be 64-byte aligned (bottom 6 bits zero) + let base = StrtabBase::new().with_addr_bits(0x1000_0000_u64 >> 6); + assert_eq!(base.addr(), 0x1000_0000); + + let base = StrtabBase::new().with_addr_bits(0x0080_0000_0000_u64 >> 6); + assert_eq!(base.addr(), 0x0080_0000_0000); + } + + #[test] + fn test_strtab_base_cfg_roundtrip() { + let cfg = StrtabBaseCfg::new() + .with_fmt(StrtabFmt::LINEAR.0) + .with_log2size(8); + assert_eq!(cfg.fmt(), 
StrtabFmt::LINEAR.0); + assert_eq!(cfg.log2size(), 8); + } + + #[test] + fn test_queue_base_address() { + let base = QueueBase::new() + .with_addr_bits(0x2000_0000_u64 >> 5) + .with_log2size(8); + assert_eq!(base.addr(), 0x2000_0000); + assert_eq!(base.log2size(), 8); + } + + #[test] + fn test_cmdq_cons_error() { + let cons = CmdqCons::new() + .with_rd(42) + .with_err(CmdqError::CERROR_ILL.0); + assert_eq!(cons.rd(), 42); + assert_eq!(cons.err(), CmdqError::CERROR_ILL.0); + } + + #[test] + fn test_register_offsets() { + // Verify offsets match the spec + assert_eq!(IDR0, 0x0000); + assert_eq!(IDR1, 0x0004); + assert_eq!(IDR5, 0x0014); + assert_eq!(IIDR, 0x0018); + assert_eq!(AIDR, 0x001C); + assert_eq!(CR0, 0x0020); + assert_eq!(CR0ACK, 0x0024); + assert_eq!(CR1, 0x0028); + assert_eq!(CR2, 0x002C); + assert_eq!(GBPA, 0x0044); + assert_eq!(IRQ_CTRL, 0x0050); + assert_eq!(IRQ_CTRLACK, 0x0054); + assert_eq!(GERROR, 0x0060); + assert_eq!(GERRORN, 0x0064); + assert_eq!(GERROR_IRQ_CFG0, 0x0068); + assert_eq!(STRTAB_BASE, 0x0080); + assert_eq!(STRTAB_BASE_CFG, 0x0088); + assert_eq!(CMDQ_BASE, 0x0090); + assert_eq!(CMDQ_PROD, 0x0098); + assert_eq!(CMDQ_CONS, 0x009C); + assert_eq!(EVENTQ_BASE, 0x00A0); + assert_eq!(EVENTQ_IRQ_CFG0, 0x00B0); + assert_eq!(EVENTQ_PROD_PAGE1, 0x100A8); + assert_eq!(EVENTQ_CONS_PAGE1, 0x100AC); + } +} diff --git a/vm/devices/iommu/smmu/src/spec/ste.rs b/vm/devices/iommu/smmu/src/spec/ste.rs new file mode 100644 index 0000000000..0c40340bf8 --- /dev/null +++ b/vm/devices/iommu/smmu/src/spec/ste.rs @@ -0,0 +1,309 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! SMMUv3 Stream Table Entry (STE) definitions. +//! +//! Each STE is 64 bytes (512 bits). The STE describes how the SMMU processes +//! transactions for a given stream (device). 
+
+use bitfield_struct::bitfield;
+use open_enum::open_enum;
+use zerocopy::FromBytes;
+use zerocopy::Immutable;
+use zerocopy::IntoBytes;
+use zerocopy::KnownLayout;
+
+/// Stream table entry size in bytes.
+pub const STE_SIZE: usize = 64;
+
+/// Stream table entry (64 bytes).
+///
+/// Only the first two quadwords have defined fields for stage 1 translation.
+/// The remaining quadwords are used for stage 2 and other optional features.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct Ste {
+    /// Quadword 0: Valid, Config, S1 context pointer.
+    pub qw0: SteDw0,
+    /// Quadword 1: Stage 1 attributes, stream world.
+    pub qw1: SteDw1,
+    /// Quadwords 2-7: Stage 2 fields (unused for S1-only).
+    pub _qw2_7: [u64; 6],
+}
+
+impl Ste {
+    /// Returns true if the STE is valid (V bit set).
+    pub fn valid(&self) -> bool {
+        self.qw0.v()
+    }
+
+    /// Returns the stream configuration.
+    pub fn config(&self) -> SteConfig {
+        SteConfig(self.qw0.config())
+    }
+
+    /// Returns the stage 1 context descriptor pointer (physical address).
+    ///
+    /// The pointer is stored in bits `[55:6]` of QW0, so the actual address
+    /// is the stored value shifted left by 6.
+    pub fn s1_context_ptr(&self) -> u64 {
+        self.qw0.s1_context_ptr() << 6
+    }
+
+    /// Returns the S1CDMax field (log2 of number of context descriptors).
+    pub fn s1_cd_max(&self) -> u8 {
+        self.qw0.s1_cd_max()
+    }
+
+    /// Returns the S1Fmt field (CD table format).
+    pub fn s1_fmt(&self) -> u8 {
+        self.qw0.s1_fmt()
+    }
+}
+
+/// STE QW0 (bits `[63:0]`): Valid, Config, S1 pointers.
+#[bitfield(u64)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct SteDw0 {
+    /// Valid bit.
+    pub v: bool,
+    /// Stream configuration.
+    #[bits(3)]
+    pub config: u8,
+    /// Stage 1 CD table format (0=linear, 1=2-level 4KB, 2=2-level 64KB).
+    #[bits(2)]
+    pub s1_fmt: u8,
+    /// Stage 1 context descriptor pointer, bits `[55:6]` (address >> 6).
+    #[bits(50)]
+    pub s1_context_ptr: u64,
+    #[bits(3)]
+    _reserved: u64,
+    /// Log2(number of CDs). 0 = single CD.
+    #[bits(5)]
+    pub s1_cd_max: u8,
+}
+
+/// STE QW1 (bits `[127:64]`): Stage 1 attributes, stream world, etc.
+#[bitfield(u64)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct SteDw1 {
+    /// S1 default substream behavior.
+    #[bits(2)]
+    pub s1_dss: u8,
+    /// CD pointer inner cacheability.
+    #[bits(2)]
+    pub s1_cir: u8,
+    /// CD pointer outer cacheability.
+    #[bits(2)]
+    pub s1_cor: u8,
+    /// CD pointer shareability.
+    #[bits(2)]
+    pub s1_csh: u8,
+    #[bits(4)]
+    _reserved0: u64,
+    /// Destructive read enable (DRE).
+    pub dre: bool,
+    /// Contiguous hint.
+    pub cont: bool,
+    #[bits(2)]
+    _reserved1: u64,
+    /// Memory type config / MemAttr / MEV.
+    #[bits(5)]
+    pub mem_attr_and_mev: u8,
+    #[bits(3)]
+    _reserved2: u64,
+    /// Allocation configuration.
+    #[bits(4)]
+    pub alloccfg: u8,
+    /// Shareability override.
+    #[bits(2)]
+    pub shcfg: u8,
+    /// NS configuration.
+    #[bits(2)]
+    pub nscfg: u8,
+    #[bits(3)]
+    _reserved3: u64,
+    /// Stream world. NOTE(review): Linux has STRW at QW1 `[31:30]`, not `[36:35]`; verify offsets.
+    #[bits(2)]
+    pub strw: u8,
+    /// Memory type config override.
+    pub mtcfg: bool,
+    /// Memory attribute (for bypass).
+    #[bits(4)]
+    pub mem_attr: u8,
+    /// Instruction/data override.
+    #[bits(2)]
+    pub instcfg: u8,
+    /// Privilege override.
+    #[bits(2)]
+    pub privcfg: u8,
+    /// Software reserved fields.
+    #[bits(4)]
+    pub sw_reserved: u8,
+    /// EATS (ATS behavior).
+    #[bits(3)]
+    pub eats: u8,
+    /// S2 VMID (ignored for S2 bypass).
+    #[bits(11)]
+    pub s2_vmid: u16,
+}
+
+open_enum! {
+    /// STE Config field values (bits `[3:1]` of QW0).
+    pub enum SteConfig: u8 {
+        /// Abort: all transactions are aborted.
+        ABORT = 0b000,
+        /// Bypass: S1 bypass, S2 bypass (identity mapping).
+        BYPASS = 0b100,
+        /// S1 Translate, S2 Bypass.
+        S1_TRANS = 0b101,
+        /// S1 Bypass, S2 Translate.
+        S2_TRANS = 0b110,
+        /// S1 Translate, S2 Translate.
+        S1S2_TRANS = 0b111,
+    }
+}
+
+open_enum! {
+    /// STE S1Fmt (CD table format) values.
+    pub enum S1Fmt: u8 {
+        /// Linear CD table.
+        LINEAR = 0b00,
+        /// 2-level CD table, 4KB L2.
+        TWO_LEVEL_4K = 0b01,
+        /// 2-level CD table, 64KB L2.
+        TWO_LEVEL_64K = 0b10,
+    }
+}
+
+open_enum! {
+    /// STE stream world values.
+    pub enum Strw: u8 {
+        /// Non-secure EL1.
+        NS_EL1 = 0b00,
+        /// Non-secure EL2.
+        NS_EL2 = 0b10,
+        /// EL2 with E2H.
+        EL2_E2H = 0b11,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ste_size() {
+        assert_eq!(size_of::<Ste>(), STE_SIZE);
+    }
+
+    #[test]
+    fn test_ste_valid() {
+        let ste = Ste {
+            qw0: SteDw0::new(),
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        };
+        assert!(!ste.valid());
+
+        let ste = Ste {
+            qw0: SteDw0::new().with_v(true),
+            ..ste
+        };
+        assert!(ste.valid());
+    }
+
+    #[test]
+    fn test_ste_config_values() {
+        assert_eq!(SteConfig::ABORT.0, 0b000);
+        assert_eq!(SteConfig::BYPASS.0, 0b100);
+        assert_eq!(SteConfig::S1_TRANS.0, 0b101);
+        assert_eq!(SteConfig::S2_TRANS.0, 0b110);
+        assert_eq!(SteConfig::S1S2_TRANS.0, 0b111);
+    }
+
+    #[test]
+    fn test_ste_dw0_fields() {
+        let dw0 = SteDw0::new()
+            .with_v(true)
+            .with_config(SteConfig::S1_TRANS.0)
+            .with_s1_fmt(S1Fmt::LINEAR.0)
+            .with_s1_context_ptr(0x1000_0000_u64 >> 6)
+            .with_s1_cd_max(0);
+
+        assert!(dw0.v());
+        assert_eq!(dw0.config(), SteConfig::S1_TRANS.0);
+        assert_eq!(dw0.s1_fmt(), S1Fmt::LINEAR.0);
+        assert_eq!(dw0.s1_context_ptr() << 6, 0x1000_0000);
+        assert_eq!(dw0.s1_cd_max(), 0);
+    }
+
+    #[test]
+    fn test_ste_dw1_fields() {
+        let dw1 = SteDw1::new()
+            .with_s1_cir(0b01) // WB
+            .with_s1_cor(0b01) // WB
+            .with_s1_csh(0b11) // ISH
+            .with_strw(Strw::NS_EL1.0);
+
+        assert_eq!(dw1.s1_cir(), 0b01);
+        assert_eq!(dw1.s1_cor(), 0b01);
+        assert_eq!(dw1.s1_csh(), 0b11);
+        assert_eq!(dw1.strw(), Strw::NS_EL1.0);
+    }
+
+    #[test]
+    fn test_ste_bypass() {
+        let ste = Ste {
+            qw0: SteDw0::new().with_v(true).with_config(SteConfig::BYPASS.0),
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        };
+
+        assert!(ste.valid());
+        assert_eq!(ste.config(), SteConfig::BYPASS);
+    }
+
+    #[test]
+    fn test_ste_s1_trans() {
+        let cd_addr: u64 = 0x8000_0000;
+        let ste = Ste {
+            qw0: SteDw0::new()
+                .with_v(true)
+                .with_config(SteConfig::S1_TRANS.0)
+                .with_s1_fmt(S1Fmt::LINEAR.0)
+                .with_s1_context_ptr(cd_addr >> 6)
+                .with_s1_cd_max(0),
+            qw1: SteDw1::new()
+                .with_s1_cir(0b01)
+                .with_s1_cor(0b01)
+                .with_s1_csh(0b11)
+                .with_strw(Strw::NS_EL1.0),
+            _qw2_7: [0; 6],
+        };
+
+        assert!(ste.valid());
+        assert_eq!(ste.config(), SteConfig::S1_TRANS);
+        assert_eq!(ste.s1_context_ptr(), cd_addr);
+        assert_eq!(ste.s1_cd_max(), 0);
+        assert_eq!(ste.s1_fmt(), S1Fmt::LINEAR.0);
+    }
+
+    #[test]
+    fn test_ste_invalid_returns_fault() {
+        let ste = Ste {
+            qw0: SteDw0::new(),
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        };
+        assert!(!ste.valid());
+    }
+
+    #[test]
+    fn test_ste_context_ptr_alignment() {
+        // Context pointer is 64-byte aligned (bits [55:6])
+        let dw0 = SteDw0::new().with_s1_context_ptr(0xABCD_EF00_u64 >> 6);
+        // Reconstructed address should be 64-byte aligned
+        assert_eq!((dw0.s1_context_ptr() << 6) & 0x3F, 0);
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/translate.rs b/vm/devices/iommu/smmu/src/translate.rs
new file mode 100644
index 0000000000..dcbfcbc701
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/translate.rs
@@ -0,0 +1,1046 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMU translation logic: STE lookup, CD lookup, and translation context.
+//!
+//! This module handles the IOVA→GPA translation chain:
+//! 1. Look up the Stream Table Entry (STE) by stream ID.
+//! 2. Determine the translation action from STE.Config.
+//! 3. For S1 translation, look up the Context Descriptor (CD).
+//! 4. Extract the translation context (page table base, granule, etc.).
+
+use crate::spec::cd::Cd;
+use crate::spec::cd::Tg0;
+use crate::spec::events::EventId;
+use crate::spec::events::EvtEntry;
+use crate::spec::pt::ApBits;
+use crate::spec::pt::PtDesc;
+use crate::spec::ste::STE_SIZE;
+use crate::spec::ste::Ste;
+use crate::spec::ste::SteConfig;
+use guestmem::GuestMemory;
+
+/// Result of an STE config dispatch.
+#[derive(Debug, PartialEq, Eq)]
+pub enum SteAction {
+    /// Abort all transactions for this stream.
+    Abort,
+    /// Bypass translation (identity IOVA=GPA).
+    Bypass,
+    /// Stage 1 translation — proceed to CD lookup.
+    S1Translate,
+}
+
+/// Parameters for walking an AArch64 stage 1 page table, extracted from
+/// STE + CD.
+#[derive(Debug, Clone)]
+pub struct TranslationContext {
+    /// Page table base address (physical/GPA) from CD.TTB0.
+    pub ttb0: u64,
+    /// Input address size: VA range = 2^(64 - t0sz).
+    pub t0sz: u8,
+    /// Granule size (4K, 16K, or 64K).
+    pub tg0: Tg0,
+    /// Output address size in bits.
+    pub oas_bits: u8,
+    /// MAIR0 value (for attribute interpretation — not needed for address
+    /// translation yet, but will be used for TLB and memory attribute
+    /// emulation).
+    pub _mair0: u64,
+    /// ASID (for TLB tagging — will be used when a software TLB is added).
+    pub _asid: u16,
+}
+
+/// Error from STE/CD lookup.
+#[derive(Debug)]
+pub struct SmmuFault {
+    /// The event to write to the EVTQ.
+    pub event: EvtEntry,
+}
+
+impl SmmuFault {
+    fn bad_ste(sid: u32) -> Self {
+        SmmuFault {
+            event: EvtEntry::bad_ste(sid),
+        }
+    }
+
+    fn bad_streamid(sid: u32) -> Self {
+        SmmuFault {
+            event: EvtEntry {
+                header: crate::spec::events::EvtHeader::new()
+                    .with_event_id(EventId::C_BAD_STREAMID.0),
+                sid,
+                ..EvtEntry::new()
+            },
+        }
+    }
+
+    fn bad_cd(sid: u32) -> Self {
+        SmmuFault {
+            event: EvtEntry::bad_cd(sid),
+        }
+    }
+}
+
+/// Look up the STE for a given stream ID.
+///
+/// `strtab_base` is the physical base address of the linear stream table.
+/// `strtab_log2size` is the log2 of the number of entries.
+/// Returns the parsed STE, or a `C_BAD_STREAMID` / `C_BAD_STE` fault event.
+pub fn lookup_ste(
+    gm: &GuestMemory,
+    strtab_base: u64,
+    strtab_log2size: u8,
+    sid: u32,
+) -> Result<Ste, SmmuFault> {
+    // Check stream ID is in range.
+    let max_sid = 1u64 << strtab_log2size;
+    if (sid as u64) >= max_sid {
+        return Err(SmmuFault::bad_streamid(sid));
+    }
+
+    let ste_addr = strtab_base + (sid as u64) * (STE_SIZE as u64);
+    let ste: Ste = gm
+        .read_plain(ste_addr)
+        .map_err(|_| SmmuFault::bad_ste(sid))?;
+
+    if !ste.valid() {
+        return Err(SmmuFault::bad_ste(sid));
+    }
+
+    Ok(ste)
+}
+
+/// Map an STE's Config field to an action; unsupported values are returned as `Err`.
+pub fn ste_config_action(ste: &Ste) -> Result<SteAction, SteConfig> {
+    match ste.config() {
+        SteConfig::ABORT => Ok(SteAction::Abort),
+        SteConfig::BYPASS => Ok(SteAction::Bypass),
+        SteConfig::S1_TRANS => Ok(SteAction::S1Translate),
+        other => Err(other),
+    }
+}
+
+/// Look up the context descriptor for a given STE.
+///
+/// `ssid` is the sub-stream ID (0 for single-CD setups).
+/// Returns the parsed CD, or a `C_BAD_CD` fault event.
+pub fn lookup_cd(gm: &GuestMemory, ste: &Ste, sid: u32, ssid: u32) -> Result<Cd, SmmuFault> {
+    let s1_context_ptr = ste.s1_context_ptr();
+    let s1_cd_max = ste.s1_cd_max();
+
+    // Validate SSID is within range.
+    if s1_cd_max > 0 {
+        let max_ssid = 1u32 << s1_cd_max;
+        if ssid >= max_ssid {
+            return Err(SmmuFault::bad_cd(sid));
+        }
+    } else if ssid != 0 {
+        return Err(SmmuFault::bad_cd(sid));
+    }
+
+    let cd_addr = s1_context_ptr + (ssid as u64) * (crate::spec::cd::CD_SIZE as u64);
+    let cd: Cd = gm.read_plain(cd_addr).map_err(|_| SmmuFault::bad_cd(sid))?;
+
+    if !cd.valid() {
+        return Err(SmmuFault::bad_cd(sid));
+    }
+
+    // Only AArch64 page tables are supported.
+    if !cd.aa64() {
+        return Err(SmmuFault::bad_cd(sid));
+    }
+
+    Ok(cd)
+}
+
+/// Extract the translation context from a parsed CD.
+///
+/// Returns `Err` with a `SmmuFault` if the CD contains unsupported or
+/// invalid configuration (e.g., unrecognized granule or IPS encoding).
+pub fn translation_context(cd: &Cd, sid: u32) -> Result { + let tg0 = cd.tg0(); + let ips = cd.ips(); + + // Validate granule. + if tg0.granule_size().is_none() { + return Err(SmmuFault::bad_cd(sid)); + } + + // Validate IPS. + let oas_bits = ips.bits().ok_or_else(|| SmmuFault::bad_cd(sid))?; + + let t0sz = cd.t0sz(); + if t0sz > 48 { + return Err(SmmuFault::bad_cd(sid)); + } + + // EPD0=1 means TTB0 walks are disabled — all accesses fault. + if cd.epd0() { + return Err(SmmuFault::bad_cd(sid)); + } + + Ok(TranslationContext { + ttb0: cd.ttb0(), + t0sz, + tg0, + oas_bits, + _mair0: cd.mair0, + _asid: cd.asid(), + }) +} + +/// Result of a successful page table walk. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Translation { + /// Translated guest physical address (with page offset applied). + pub gpa: u64, + /// Page size of the mapping (granule for pages, block size for blocks). + pub page_size: u64, +} + +/// Compute the start level and number of VA bits for a given granule and T0SZ. +/// +/// Returns `(start_level, va_bits)` where `va_bits = 64 - t0sz`. +fn compute_start_level(tg0: Tg0, t0sz: u8) -> Option<(u8, u8)> { + let va_bits = 64u8.checked_sub(t0sz)?; + let bits_per_level = tg0.bits_per_level()?; + let page_shift = tg0.page_shift()?; + + // Number of address bits resolved by the page table walk (excluding page + // offset). For 4K/9 bits per level: va_bits - 12 bits are resolved by + // the walk. + let resolve_bits = va_bits.checked_sub(page_shift)?; + + // Number of full levels needed = ceil(resolve_bits / bits_per_level). + // Start level = 4 - num_levels (levels are numbered 0..3). + // num_levels == 0 means the VA space is exactly one page (no walk + // needed), which is an invalid configuration. 
+ let num_levels = resolve_bits.div_ceil(bits_per_level); + if num_levels == 0 || num_levels > 4 { + return None; + } + let start_level = 4 - num_levels; + + Some((start_level, va_bits)) +} + +/// Walk AArch64 stage 1 translation tables to translate an IOVA to a GPA. +/// +/// `gm` is the guest memory (for reading page table entries from guest RAM). +/// `ctx` holds the page table root and configuration (from STE+CD). +/// `iova` is the input virtual address to translate. +/// `write` is true for write accesses (for permission checking). +/// `sid` is the stream ID (for fault event construction). +/// +/// Returns the translated GPA and page size, or an `SmmuFault` with the +/// event to report. +pub fn walk_s1( + gm: &GuestMemory, + ctx: &TranslationContext, + iova: u64, + write: bool, + sid: u32, +) -> Result { + let tg0 = ctx.tg0; + let page_shift = tg0.page_shift().ok_or_else(|| SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + })?; + let bits_per_level = tg0.bits_per_level().ok_or_else(|| SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + })?; + let page_size = 1u64 << page_shift; + + let (start_level, va_bits) = compute_start_level(tg0, ctx.t0sz).ok_or_else(|| SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + })?; + + // Check IOVA is within the valid range (2^va_bits). + let va_mask = if va_bits >= 64 { + u64::MAX + } else { + (1u64 << va_bits) - 1 + }; + if iova > va_mask { + return Err(SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + }); + } + + let oas_mask = if ctx.oas_bits >= 64 { + u64::MAX + } else { + (1u64 << ctx.oas_bits) - 1 + }; + + let mut table_addr = ctx.ttb0; + let mut level = start_level; + + loop { + // Compute the index at this level. 
+ // For level `l` with 4K granule (9 bits/level, 12-bit page offset): + // Level 0: bits [47:39] (9 bits) + // Level 1: bits [38:30] (9 bits) + // Level 2: bits [29:21] (9 bits) + // Level 3: bits [20:12] (9 bits) + // General formula: shift = page_shift + (3 - level) * bits_per_level + let shift = page_shift as u32 + (3 - level as u32) * bits_per_level as u32; + let index_mask = (1u64 << bits_per_level) - 1; + + // For the start level, the number of index bits may be smaller than + // bits_per_level when va_bits is not a multiple of bits_per_level. + let index = (iova >> shift) & index_mask; + + let desc_addr = table_addr + index * 8; + let desc: PtDesc = gm.read_plain(desc_addr).map_err(|_| SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + })?; + + if !desc.is_valid() { + return Err(SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + }); + } + + if level == 3 { + // At level 3, type=1 means page, type=0 is reserved (fault). + if !desc.desc_type() { + return Err(SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + }); + } + // Page descriptor at L3. + check_permissions(&desc, iova, write, sid)?; + let output_addr = output_address(&desc, tg0, level); + if output_addr > oas_mask { + return Err(SmmuFault { + event: EvtEntry::addr_size_fault(sid, iova, write), + }); + } + let page_offset = iova & (page_size - 1); + return Ok(Translation { + gpa: output_addr | page_offset, + page_size, + }); + } + + if desc.is_block() { + // Block descriptor at level 1 or 2. + check_permissions(&desc, iova, write, sid)?; + let block_size = 1u64 << shift; + let output_addr = output_address(&desc, tg0, level); + if output_addr > oas_mask { + return Err(SmmuFault { + event: EvtEntry::addr_size_fault(sid, iova, write), + }); + } + let block_offset = iova & (block_size - 1); + return Ok(Translation { + gpa: output_addr | block_offset, + page_size: block_size, + }); + } + + // Table descriptor — descend to next level. 
+ table_addr = desc.next_table_addr(); + level += 1; + + if level > 3 { + // Should not happen with well-formed page tables. + return Err(SmmuFault { + event: EvtEntry::translation_fault(sid, iova, write), + }); + } + } +} + +/// Check access permissions and access flag on a leaf descriptor. +fn check_permissions(desc: &PtDesc, iova: u64, write: bool, sid: u32) -> Result<(), SmmuFault> { + // Check access flag. + if !desc.af() { + return Err(SmmuFault { + event: EvtEntry::access_fault(sid, iova, write), + }); + } + + // Check write permission. + if write { + let ap = ApBits(desc.ap()); + if !ap.allows_write() { + return Err(SmmuFault { + event: EvtEntry::permission_fault(sid, iova, write), + }); + } + } + + Ok(()) +} + +/// Extract the output address from a leaf descriptor, masking to the +/// appropriate alignment for the given level and granule. +fn output_address(desc: &PtDesc, tg0: Tg0, level: u8) -> u64 { + match tg0 { + Tg0::GRAN_4K => desc.output_address_4k(level), + Tg0::GRAN_16K => desc.output_address_16k(level), + Tg0::GRAN_64K => desc.output_address_64k(level), + _ => desc.output_address_4k(level), // fallback, shouldn't happen + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::spec::cd::CD_SIZE; + use crate::spec::cd::CdDw0; + use crate::spec::cd::CdDw1; + use crate::spec::cd::Ips; + use crate::spec::ste::SteDw0; + use crate::spec::ste::SteDw1; + + const STRTAB_BASE: u64 = 0x10_0000; + const CD_BASE: u64 = 0x20_0000; + const STRTAB_LOG2SIZE: u8 = 10; // 1024 entries + + /// Build a valid STE for S1 translation pointing to a CD table. + fn make_s1_ste(cd_base: u64) -> Ste { + Ste { + qw0: SteDw0::new() + .with_v(true) + .with_config(SteConfig::S1_TRANS.0) + .with_s1_context_ptr(cd_base >> 6) + .with_s1_cd_max(0), // single CD + qw1: SteDw1::new(), + _qw2_7: [0; 6], + } + } + + /// Build a valid STE for bypass. 
+ fn make_bypass_ste() -> Ste { + Ste { + qw0: SteDw0::new().with_v(true).with_config(SteConfig::BYPASS.0), + qw1: SteDw1::new(), + _qw2_7: [0; 6], + } + } + + /// Build a valid STE for abort. + fn make_abort_ste() -> Ste { + Ste { + qw0: SteDw0::new().with_v(true).with_config(SteConfig::ABORT.0), + qw1: SteDw1::new(), + _qw2_7: [0; 6], + } + } + + /// Build a valid CD. + fn make_cd(ttb0: u64, t0sz: u8, tg0: Tg0, ips: Ips) -> Cd { + Cd { + qw0: CdDw0::new() + .with_v(true) + .with_t0sz(t0sz) + .with_tg0(tg0.0) + .with_ips(ips.0) + .with_aa64(true) + .with_asid(1), + qw1: CdDw1::new().with_ttb0(ttb0 >> 4), + _qw2: 0, + mair0: 0xFF440C0400, + mair1: 0, + _qw5_7: [0; 3], + } + } + + /// Write an STE to guest memory at the given stream ID. + fn write_ste(gm: &GuestMemory, sid: u32, ste: &Ste) { + let addr = STRTAB_BASE + (sid as u64) * (STE_SIZE as u64); + gm.write_plain(addr, ste).expect("write STE"); + } + + /// Write a CD to guest memory at the given SSID offset from cd_base. + fn write_cd(gm: &GuestMemory, cd_base: u64, ssid: u32, cd: &Cd) { + let addr = cd_base + (ssid as u64) * (CD_SIZE as u64); + gm.write_plain(addr, cd).expect("write CD"); + } + + // ========================================================================= + // STE lookup tests + // ========================================================================= + + #[test] + fn test_ste_lookup_valid() { + let gm = GuestMemory::allocate(0x40_0000); + let ste = make_s1_ste(CD_BASE); + write_ste(&gm, 5, &ste); + + let result = lookup_ste(&gm, STRTAB_BASE, STRTAB_LOG2SIZE, 5); + let found = result.expect("STE lookup should succeed"); + assert!(found.valid()); + assert_eq!(found.config(), SteConfig::S1_TRANS); + assert_eq!(found.s1_context_ptr(), CD_BASE); + } + + #[test] + fn test_ste_lookup_invalid_v0() { + let gm = GuestMemory::allocate(0x40_0000); + // Write an STE with V=0. 
+ let ste = Ste { + qw0: SteDw0::new().with_v(false), + qw1: SteDw1::new(), + _qw2_7: [0; 6], + }; + write_ste(&gm, 3, &ste); + + let result = lookup_ste(&gm, STRTAB_BASE, STRTAB_LOG2SIZE, 3); + let fault = result.expect_err("Should fault on V=0"); + assert_eq!(fault.event.event_id(), EventId::C_BAD_STE); + assert_eq!(fault.event.sid, 3); + } + + #[test] + fn test_ste_lookup_out_of_range() { + let gm = GuestMemory::allocate(0x40_0000); + // Stream ID 2048 is out of range for log2size=10 (max 1024). + let result = lookup_ste(&gm, STRTAB_BASE, STRTAB_LOG2SIZE, 2048); + let fault = result.expect_err("Should fault on out-of-range SID"); + assert_eq!(fault.event.event_id(), EventId::C_BAD_STREAMID); + } + + // ========================================================================= + // STE config dispatch tests + // ========================================================================= + + #[test] + fn test_ste_config_abort() { + let ste = make_abort_ste(); + assert_eq!(ste_config_action(&ste), Ok(SteAction::Abort)); + } + + #[test] + fn test_ste_config_bypass() { + let ste = make_bypass_ste(); + assert_eq!(ste_config_action(&ste), Ok(SteAction::Bypass)); + } + + #[test] + fn test_ste_config_s1_trans() { + let ste = make_s1_ste(CD_BASE); + assert_eq!(ste_config_action(&ste), Ok(SteAction::S1Translate)); + } + + #[test] + fn test_ste_config_unknown() { + // Config = 0b010 is not a valid configuration. 
+ let ste = Ste { + qw0: SteDw0::new().with_v(true).with_config(0b010), + qw1: SteDw1::new(), + _qw2_7: [0; 6], + }; + assert!(ste_config_action(&ste).is_err()); + } + + // ========================================================================= + // CD lookup tests + // ========================================================================= + + #[test] + fn test_cd_lookup_valid() { + let gm = GuestMemory::allocate(0x40_0000); + let ste = make_s1_ste(CD_BASE); + let cd = make_cd(0x3000_0000, 32, Tg0::GRAN_4K, Ips::IPS_40); + write_cd(&gm, CD_BASE, 0, &cd); + + let result = lookup_cd(&gm, &ste, 5, 0); + let found = result.expect("CD lookup should succeed"); + assert!(found.valid()); + assert!(found.aa64()); + assert_eq!(found.ttb0(), 0x3000_0000); + assert_eq!(found.t0sz(), 32); + } + + #[test] + fn test_cd_lookup_invalid_v0() { + let gm = GuestMemory::allocate(0x40_0000); + let ste = make_s1_ste(CD_BASE); + // Write a CD with V=0. + let cd = Cd { + qw0: CdDw0::new().with_v(false), + qw1: CdDw1::new(), + _qw2: 0, + mair0: 0, + mair1: 0, + _qw5_7: [0; 3], + }; + write_cd(&gm, CD_BASE, 0, &cd); + + let result = lookup_cd(&gm, &ste, 5, 0); + let fault = result.expect_err("Should fault on V=0 CD"); + assert_eq!(fault.event.event_id(), EventId::C_BAD_CD); + } + + #[test] + fn test_cd_lookup_not_aa64() { + let gm = GuestMemory::allocate(0x40_0000); + let ste = make_s1_ste(CD_BASE); + // Write a CD with AA64=0 (AArch32 — not supported). 
+ let cd = Cd { + qw0: CdDw0::new().with_v(true).with_aa64(false), + qw1: CdDw1::new(), + _qw2: 0, + mair0: 0, + mair1: 0, + _qw5_7: [0; 3], + }; + write_cd(&gm, CD_BASE, 0, &cd); + + let result = lookup_cd(&gm, &ste, 5, 0); + let fault = result.expect_err("Should fault on non-AA64 CD"); + assert_eq!(fault.event.event_id(), EventId::C_BAD_CD); + } + + // ========================================================================= + // Translation context tests + // ========================================================================= + + #[test] + fn test_translation_context_4k() { + let cd = make_cd(0x4000_0000, 32, Tg0::GRAN_4K, Ips::IPS_40); + let ctx = translation_context(&cd, 0).expect("should succeed"); + assert_eq!(ctx.ttb0, 0x4000_0000); + assert_eq!(ctx.t0sz, 32); + assert_eq!(ctx.tg0, Tg0::GRAN_4K); + assert_eq!(ctx.oas_bits, 40); + assert_eq!(ctx._asid, 1); + } + + #[test] + fn test_translation_context_16k() { + let cd = make_cd(0x8000_0000, 28, Tg0::GRAN_16K, Ips::IPS_48); + let ctx = translation_context(&cd, 0).expect("should succeed"); + assert_eq!(ctx.tg0, Tg0::GRAN_16K); + assert_eq!(ctx.oas_bits, 48); + assert_eq!(ctx.t0sz, 28); + } + + #[test] + fn test_translation_context_bad_granule() { + // TG0 = 0b11 is reserved/invalid. + let cd = Cd { + qw0: CdDw0::new() + .with_v(true) + .with_t0sz(32) + .with_tg0(0b11) // invalid + .with_ips(Ips::IPS_40.0) + .with_aa64(true), + qw1: CdDw1::new(), + _qw2: 0, + mair0: 0, + mair1: 0, + _qw5_7: [0; 3], + }; + let result = translation_context(&cd, 0); + assert!(result.is_err()); + } + + #[test] + fn test_translation_context_bad_ips() { + // IPS = 0b111 is reserved/invalid. 
+ let cd = Cd { + qw0: CdDw0::new() + .with_v(true) + .with_t0sz(32) + .with_tg0(Tg0::GRAN_4K.0) + .with_ips(0b111) // invalid + .with_aa64(true), + qw1: CdDw1::new(), + _qw2: 0, + mair0: 0, + mair1: 0, + _qw5_7: [0; 3], + }; + let result = translation_context(&cd, 0); + assert!(result.is_err()); + } + + #[test] + fn test_translation_context_epd0() { + // EPD0=1 disables TTB0 walks. + let cd = Cd { + qw0: CdDw0::new() + .with_v(true) + .with_t0sz(32) + .with_tg0(Tg0::GRAN_4K.0) + .with_ips(Ips::IPS_40.0) + .with_aa64(true) + .with_epd0(true), + qw1: CdDw1::new(), + _qw2: 0, + mair0: 0, + mair1: 0, + _qw5_7: [0; 3], + }; + let result = translation_context(&cd, 0); + assert!(result.is_err()); + } + + // ========================================================================= + // Page table walker tests + // ========================================================================= + + // Page table memory layout constants. + const PT_L0_BASE: u64 = 0x30_0000; // L0 table + const PT_L1_BASE: u64 = 0x30_1000; // L1 table + const PT_L2_BASE: u64 = 0x30_2000; // L2 table + const PT_L3_BASE: u64 = 0x30_3000; // L3 table + const DATA_GPA: u64 = 0x4000_0000; // Target GPA for mappings + + /// Build a TranslationContext for 4K granule, T0SZ=32 (32-bit VA), 40-bit OAS. + fn make_4k_ctx(ttb0: u64) -> TranslationContext { + TranslationContext { + ttb0, + t0sz: 32, + tg0: Tg0::GRAN_4K, + oas_bits: 40, + _mair0: 0xFF440C0400, + _asid: 1, + } + } + + /// Write a page table descriptor at the given address. + fn write_pt_desc(gm: &GuestMemory, addr: u64, desc: u64) { + gm.write_plain(addr, &desc).expect("write PT desc"); + } + + /// Build a table descriptor pointing to the given next-level table address. + fn table_desc(next_table: u64) -> u64 { + // Valid=1, Type=1 (table), address in bits [47:12]. 
+ let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) // table + .with_addr_bits(next_table >> 12); + desc.into() + } + + /// Build a block descriptor for a given output address with RW, AF set. + fn block_desc(output_addr: u64) -> u64 { + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(false) // block + .with_af(true) + .with_ap(ApBits::RW_EL1.0) + .with_addr_bits(output_addr >> 12); + desc.into() + } + + /// Build a page descriptor (L3) for a given output address with RW, AF set. + fn page_desc(output_addr: u64) -> u64 { + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) // page at L3 + .with_af(true) + .with_ap(ApBits::RW_EL1.0) + .with_addr_bits(output_addr >> 12); + desc.into() + } + + /// Build a read-only page descriptor. + fn ro_page_desc(output_addr: u64) -> u64 { + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_af(true) + .with_ap(ApBits::RO_EL1.0) + .with_addr_bits(output_addr >> 12); + desc.into() + } + + /// Build a page descriptor with AF=0 (access flag not set). + fn no_af_page_desc(output_addr: u64) -> u64 { + let desc = PtDesc::new() + .with_valid(true) + .with_desc_type(true) + .with_af(false) + .with_ap(ApBits::RW_EL1.0) + .with_addr_bits(output_addr >> 12); + desc.into() + } + + #[test] + fn test_walk_4k_single_level_block() { + // T0SZ=32 with 4K granule: 32-bit VA space. + // Walk starts at level 1 (levels 1, 2, 3). + // Map a 1GB block at level 1 entry 0 → DATA_GPA. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // Level 1 entry 0: 1GB block → DATA_GPA. + write_pt_desc(&gm, PT_L1_BASE, block_desc(DATA_GPA)); + + let result = walk_s1(&gm, &ctx, 0, false, 0); + let tr = result.expect("should translate"); + assert_eq!(tr.gpa, DATA_GPA); + assert_eq!(tr.page_size, 1 << 30); // 1GB block + } + + #[test] + fn test_walk_4k_four_levels() { + // T0SZ=16 with 4K granule: 48-bit VA space, 4 levels (0-3). 
+ let gm = GuestMemory::allocate(0x5000_0000); + let ctx = TranslationContext { + ttb0: PT_L0_BASE, + t0sz: 16, + tg0: Tg0::GRAN_4K, + oas_bits: 48, + _mair0: 0, + _asid: 0, + }; + + // L0[0] → L1 table + write_pt_desc(&gm, PT_L0_BASE, table_desc(PT_L1_BASE)); + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → L3 table + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + // L3[0] → page at DATA_GPA + write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA)); + + let result = walk_s1(&gm, &ctx, 0, false, 0); + let tr = result.expect("should translate"); + assert_eq!(tr.gpa, DATA_GPA); + assert_eq!(tr.page_size, 4096); + } + + #[test] + fn test_walk_4k_2mb_block() { + // T0SZ=32, 4K granule. Level 2 block descriptor (2MB). + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → 2MB block at DATA_GPA + write_pt_desc(&gm, PT_L2_BASE, block_desc(DATA_GPA)); + + let result = walk_s1(&gm, &ctx, 0, false, 0); + let tr = result.expect("should translate"); + assert_eq!(tr.gpa, DATA_GPA); + assert_eq!(tr.page_size, 2 << 20); // 2MB + } + + #[test] + fn test_walk_4k_page_with_offset() { + // Walk to a 4K page and verify the intra-page offset is preserved. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → L3 table + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + // L3[0] → page at DATA_GPA + write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA)); + + // Access IOVA 0x0000_0100 — should map to DATA_GPA + 0x100. 
+ let result = walk_s1(&gm, &ctx, 0x100, false, 0); + let tr = result.expect("should translate"); + assert_eq!(tr.gpa, DATA_GPA + 0x100); + assert_eq!(tr.page_size, 4096); + } + + #[test] + fn test_walk_4k_block_with_offset() { + // Walk to a 2MB block and verify the intra-block offset is preserved. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → 2MB block at DATA_GPA + write_pt_desc(&gm, PT_L2_BASE, block_desc(DATA_GPA)); + + // Access IOVA 0x0001_2345 — should map to DATA_GPA + 0x0001_2345. + let result = walk_s1(&gm, &ctx, 0x0001_2345, false, 0); + let tr = result.expect("should translate"); + assert_eq!(tr.gpa, DATA_GPA + 0x0001_2345); + assert_eq!(tr.page_size, 2 << 20); + } + + #[test] + fn test_walk_fault_unmapped() { + // Walk with a PTE that has Valid=0. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[0] is all zeros (invalid). + let result = walk_s1(&gm, &ctx, 0, false, 42); + let fault = result.expect_err("should fault"); + assert_eq!(fault.event.event_id(), EventId::F_TRANSLATION); + assert_eq!(fault.event.sid, 42); + } + + #[test] + fn test_walk_fault_permission() { + // Write to a read-only page. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → L3 table + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + // L3[0] → read-only page + write_pt_desc(&gm, PT_L3_BASE, ro_page_desc(DATA_GPA)); + + // Read should succeed. + let result = walk_s1(&gm, &ctx, 0, false, 0); + assert!(result.is_ok()); + + // Write should fault. 
+ let result = walk_s1(&gm, &ctx, 0, true, 0); + let fault = result.expect_err("should fault on write to RO"); + assert_eq!(fault.event.event_id(), EventId::F_PERMISSION); + } + + #[test] + fn test_walk_fault_access_flag() { + // Page with AF=0 — should produce F_ACCESS fault. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → L3 table + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + // L3[0] → page with AF=0 + write_pt_desc(&gm, PT_L3_BASE, no_af_page_desc(DATA_GPA)); + + let result = walk_s1(&gm, &ctx, 0, false, 0); + let fault = result.expect_err("should fault on AF=0"); + assert_eq!(fault.event.event_id(), EventId::F_ACCESS); + } + + #[test] + fn test_walk_fault_addr_size() { + // Output address exceeds OAS. + let gm = GuestMemory::allocate(0x5000_0000); + // 32-bit OAS — output addresses must fit in 32 bits. + let ctx = TranslationContext { + ttb0: PT_L1_BASE, + t0sz: 32, + tg0: Tg0::GRAN_4K, + oas_bits: 32, + _mair0: 0, + _asid: 0, + }; + + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → L3 table + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + // L3[0] → page at a high address (exceeds 32-bit OAS) + let high_addr = 0x2_0000_0000u64; // 8GB, exceeds 32-bit + write_pt_desc(&gm, PT_L3_BASE, page_desc(high_addr)); + + let result = walk_s1(&gm, &ctx, 0, false, 0); + let fault = result.expect_err("should fault on addr size"); + assert_eq!(fault.event.event_id(), EventId::F_ADDR_SIZE); + } + + #[test] + fn test_walk_iova_out_of_range() { + // IOVA exceeds the VA range defined by T0SZ. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); // T0SZ=32, VA range = 2^32 + + // IOVA = 0x1_0000_0000 (exceeds 32-bit range). 
+ let result = walk_s1(&gm, &ctx, 0x1_0000_0000, false, 0); + let fault = result.expect_err("should fault on out-of-range IOVA"); + assert_eq!(fault.event.event_id(), EventId::F_TRANSLATION); + } + + #[test] + fn test_walk_nonzero_l1_index() { + // Verify that non-zero L1 indices work correctly. + // T0SZ=32, 4K: L1 has 4 entries (indices 0-3, each covering 1GB). + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[2] → 1GB block at DATA_GPA (IOVA starting at 2GB). + let l1_entry2_addr = PT_L1_BASE + 2 * 8; + write_pt_desc(&gm, l1_entry2_addr, block_desc(DATA_GPA)); + + // IOVA = 0x8000_0000 (2GB) should use L1 index 2. + let result = walk_s1(&gm, &ctx, 0x8000_0000, false, 0); + let tr = result.expect("should translate"); + assert_eq!(tr.gpa, DATA_GPA); + assert_eq!(tr.page_size, 1 << 30); + } + + #[test] + fn test_walk_nonzero_l3_index() { + // Verify non-zero L3 index with 4K pages. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + // L1[0] → L2 table + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + // L2[0] → L3 table + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + // L3[5] → page at DATA_GPA + 0x5000 + let target = DATA_GPA + 0x5000; + write_pt_desc(&gm, PT_L3_BASE + 5 * 8, page_desc(target)); + + // IOVA = 0x5000 (L3 index 5) + offset 0x42. + let result = walk_s1(&gm, &ctx, 0x5042, false, 0); + let tr = result.expect("should translate"); + assert_eq!(tr.gpa, target + 0x42); + assert_eq!(tr.page_size, 4096); + } + + #[test] + fn test_walk_write_to_rw_page() { + // Write to a RW page should succeed. 
+ let gm = GuestMemory::allocate(0x5000_0000); + let ctx = make_4k_ctx(PT_L1_BASE); + + write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE)); + write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE)); + write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA)); + + let result = walk_s1(&gm, &ctx, 0, true, 0); + let tr = result.expect("write to RW page should succeed"); + assert_eq!(tr.gpa, DATA_GPA); + } + + #[test] + fn test_compute_start_level_4k() { + // T0SZ=32, 4K: VA bits=32, resolve=20, levels=ceil(20/9)=3, start=1 + assert_eq!(compute_start_level(Tg0::GRAN_4K, 32), Some((1, 32))); + // T0SZ=16, 4K: VA bits=48, resolve=36, levels=4, start=0 + assert_eq!(compute_start_level(Tg0::GRAN_4K, 16), Some((0, 48))); + // T0SZ=25, 4K: VA bits=39, resolve=27, levels=3, start=1 + assert_eq!(compute_start_level(Tg0::GRAN_4K, 25), Some((1, 39))); + } + + #[test] + fn test_walk_degenerate_t0sz_returns_fault() { + // 64KB granule with T0SZ=48 produces resolve_bits=0. Without + // the guard in compute_start_level, walk_s1 would compute + // start_level=4 and then evaluate (3u32 - 4u32), panicking + // in debug mode. Verify it returns a translation fault instead. + let gm = GuestMemory::allocate(0x5000_0000); + let ctx = TranslationContext { + ttb0: PT_L1_BASE, + t0sz: 48, + tg0: Tg0::GRAN_64K, + oas_bits: 40, + _mair0: 0, + _asid: 0, + }; + + let result = walk_s1(&gm, &ctx, 0, false, 99); + let fault = result.expect_err("degenerate T0SZ must fault, not panic"); + assert_eq!(fault.event.event_id(), EventId::F_TRANSLATION); + assert_eq!(fault.event.sid, 99); + } +} diff --git a/vm/devices/pci/pci_core/src/bus_range.rs b/vm/devices/pci/pci_core/src/bus_range.rs index 3dcc1c2f92..1412992253 100644 --- a/vm/devices/pci/pci_core/src/bus_range.rs +++ b/vm/devices/pci/pci_core/src/bus_range.rs @@ -9,10 +9,8 @@ //! [`ConfigSpaceType1Emulator`](crate::cfg_space_emu::ConfigSpaceType1Emulator) //! when the guest writes bus number registers, and on restore/reset. //! -//! 
Consumers (ITS wrappers, SMMU) compose a full device identity from the -//! bus range plus the device's BDF. The segment number is not included -//! here — it is a static property of the root complex and is held -//! separately by the consumer. +//! Consumers (ITS wrappers, SMMU) read the bus range to compose a full +//! device identity from the bus range plus the device's BDF. use std::sync::Arc; use std::sync::atomic::AtomicU16; diff --git a/vmm_core/src/acpi_builder.rs b/vmm_core/src/acpi_builder.rs index 49503f2f4e..98e622a372 100644 --- a/vmm_core/src/acpi_builder.rs +++ b/vmm_core/src/acpi_builder.rs @@ -22,6 +22,25 @@ use vm_topology::processor::x86::X86Topology; use x86defs::apic::APIC_BASE_ADDRESS; use zerocopy::IntoBytes; +/// Configuration for the SMMUv3 ACPI IORT node. +#[derive(Debug, Clone)] +pub struct AcpiSmmuConfig { + /// Index of the root complex this SMMU covers (matches + /// `PcieHostBridge.index`). Used to route each RC's IORT ID mapping + /// to its specific SMMU node. + pub rc_index: u32, + /// PCIe segment number of the root complex this SMMU covers. Used as + /// the output_base in the SMMU→ITS ID mapping to produce globally + /// unique ITS device IDs: `(segment << 16) | BDF`. + pub segment: u16, + /// MMIO base address of the SMMU. + pub base: u64, + /// GIC SPI INTID for the event queue interrupt. + pub event_gsiv: u32, + /// GIC SPI INTID for the global error interrupt. + pub gerr_gsiv: u32, +} + /// Binary ACPI tables constructed by [`AcpiTablesBuilder`]. pub struct BuiltAcpiTables { /// The RDSP. Assumed to be given a whole page. @@ -75,6 +94,9 @@ pub enum AcpiArchConfig { hypervisor_vendor_identity: u64, /// Virtual timer PPI (GIC INTID). virt_timer_ppi: u32, + /// SMMUv3 instances. Each entry adds an SMMUv3 IORT node for the + /// specified PCI segment. Empty means no SMMU. 
+ smmu: Vec, }, } @@ -365,13 +387,20 @@ impl AcpiTablesBuilder<'_, T> { let its_id = T::iort_its_id(self.processor_topology); let has_its = its_id.is_some(); + let smmu_configs: &[AcpiSmmuConfig] = match &self.arch { + AcpiArchConfig::Aarch64 { smmu, .. } => smmu.as_slice(), + _ => &[], + }; + let has_smmu = !smmu_configs.is_empty(); let its_node_count: u32 = if has_its { 1 } else { 0 }; - let node_count = its_node_count + self.pcie_host_bridges.len() as u32; - let mapping_count: u32 = if has_its { 1 } else { 0 }; + let smmu_node_count = smmu_configs.len() as u32; + let node_count = its_node_count + smmu_node_count + self.pcie_host_bridges.len() as u32; + // Each RC gets one ID mapping when there's a target node (SMMU or ITS). + let rc_mapping_count: u32 = if has_smmu || has_its { 1 } else { 0 }; let mut iort_extra: Vec = Vec::new(); - // ITS Group node comes first so root complexes can reference it. + // ITS Group node comes first so other nodes can reference it. // The ITS Group node offset (from table start) is IORT_NODE_OFFSET. let its_group_offset = iort::IORT_NODE_OFFSET; if let Some(id) = its_id { @@ -380,21 +409,103 @@ impl AcpiTablesBuilder<'_, T> { iort_extra.extend_from_slice(&id.to_ne_bytes()); } + // SMMUv3 nodes come after ITS Group (if present). + // Build a map from RC index → SMMU node offset for RC routing. + let mut smmu_rc_offsets: Vec<(u32, u32)> = Vec::new(); + for cfg in smmu_configs { + let smmu_node_offset = iort::IORT_NODE_OFFSET + iort_extra.len() as u32; + smmu_rc_offsets.push((cfg.rc_index, smmu_node_offset)); + + if has_its { + // The SMMUv3 node needs two ID mappings when ITS is present: + // + // [0] Range mapping: translates PCI device stream IDs through + // the SMMU to the ITS. Used by iort_node_map_id() during + // RC → SMMUv3 → ITS traversal for PCI MSI domain discovery. + // + // [1] Single mapping: identifies the ITS group for the SMMU's + // own MSI domain lookup. Referenced by + // device_id_mapping_index. 
Linux's iort_set_device_domain() + // requires IORT_ID_SINGLE_MAPPING flag on this entry. + // + // Both mappings are needed even though the SMMU uses wired SPIs + // (IDR0.MSI=0, GSIVs populated) for its own interrupts. The + // device_id_mapping is required for Linux's IORT MSI domain + // resolution infrastructure, which is independent of the + // SMMU's actual interrupt delivery mechanism. + let smmu = iort::IortSmmuV3::new_with_device_id_mapping( + 0, + cfg.base, + 2, + cfg.event_gsiv, + cfg.gerr_gsiv, + 1, // device_id_mapping_index → mapping [1] + ); + iort_extra.extend_from_slice(smmu.as_bytes()); + + // Mapping [0]: range mapping for PCI device stream IDs. + // The output_base applies the segment offset so the ITS + // receives globally unique device IDs: (segment << 16) | BDF. + // Stream IDs within this SMMU are plain BDFs (0-based). + iort_extra.extend_from_slice( + iort::IortIdMapping::new( + 0, // input_base + 0xFFFF, // id_count (16-bit BDF range) + (cfg.segment as u32) << 16, // output_base + its_group_offset, // output_reference → ITS group + 0, // flags + ) + .as_bytes(), + ); + + // Mapping [1]: single mapping for the SMMU's MSI domain. + iort_extra.extend_from_slice( + iort::IortIdMapping::new( + 0, // input_base (unused) + 0, // id_count (unused) + 0, // output_base (device ID) + its_group_offset, // output_reference → ITS group + iort::IORT_ID_SINGLE_MAPPING, // flags + ) + .as_bytes(), + ); + } else { + let smmu = iort::IortSmmuV3::new(0, cfg.base, 0, cfg.event_gsiv, cfg.gerr_gsiv); + iort_extra.extend_from_slice(smmu.as_bytes()); + } + } + for bridge in self.pcie_host_bridges { - let rc = iort::IortPciRootComplex::new(bridge.index, bridge.segment, mapping_count); + let rc = iort::IortPciRootComplex::new(bridge.index, bridge.segment, rc_mapping_count); iort_extra.extend_from_slice(rc.as_bytes()); - if has_its { - // Single ID mapping: full RID range → ITS Group node. 
- // output_base uses (segment << 16) so device IDs in the - // ITS namespace are unique across PCI segments. + if rc_mapping_count > 0 { + // Route this RC to its SMMU if one exists, + // otherwise directly to the ITS group. + let (rc_target_offset, has_smmu) = smmu_rc_offsets + .iter() + .find(|(idx, _)| *idx == bridge.index) + .map(|(_, off)| (*off, true)) + .unwrap_or((its_group_offset, false)); + + // When the RC has an SMMU, output_base is 0 because stream + // IDs are plain BDFs within the per-RC SMMU. The segment + // offset is applied in the SMMU→ITS mapping instead. + // When the RC goes directly to the ITS, output_base embeds + // the segment for globally unique ITS device IDs. + let output_base = if has_smmu { + 0 + } else { + (bridge.segment as u32) << 16 + }; + iort_extra.extend_from_slice( iort::IortIdMapping::new( - 0, // input_base - 0xFFFF, // id_count (full 16-bit BDF range, minus 1 per IORT spec) - (bridge.segment as u32) << 16, // output_base - its_group_offset, // output_reference - 0, // flags + 0, // input_base + 0xFFFF, // id_count (full 16-bit BDF range) + output_base, // output_base + rc_target_offset, // output_reference + 0, // flags ) .as_bytes(), ); @@ -1009,6 +1120,7 @@ mod test { arch: AcpiArchConfig::Aarch64 { hypervisor_vendor_identity: 0, virt_timer_ppi: 20, + smmu: vec![], }, } } @@ -1144,4 +1256,250 @@ mod test { assert!(contains_signature(&tables.tables, b"MCFG")); assert!(contains_signature(&tables.tables, b"IORT")); } + + fn new_aarch64_builder_with_smmu<'a>( + mem_layout: &'a MemoryLayout, + processor_topology: &'a ProcessorTopology, + pcie_host_bridges: &'a Vec, + smmu_base: u64, + ) -> AcpiTablesBuilder<'a, Aarch64Topology> { + AcpiTablesBuilder { + processor_topology, + mem_layout, + cache_topology: None, + pcie_host_bridges, + arch: AcpiArchConfig::Aarch64 { + hypervisor_vendor_identity: 0, + virt_timer_ppi: 20, + smmu: vec![AcpiSmmuConfig { + rc_index: 0, + segment: 0, + base: smmu_base, + event_gsiv: 35, + gerr_gsiv: 
36, + }], + }, + } + } + + fn u64_at(data: &[u8], offset: usize) -> u64 { + u64::from_ne_bytes(data[offset..offset + 8].try_into().unwrap()) + } + + fn u16_at(data: &[u8], offset: usize) -> u16 { + u16::from_ne_bytes(data[offset..offset + 2].try_into().unwrap()) + } + + #[test] + fn test_iort_with_smmu_and_its() { + use acpi_spec::iort; + + let mem = new_mem(); + let topology = new_aarch64_its_topology(); + let smmu_base: u64 = 0xEFFA_0000; + let pcie_host_bridges = vec![PcieHostBridge { + index: 0, + segment: 0, + start_bus: 0, + end_bus: 255, + ecam_range: MemoryRange::new(0..256 * 256 * 4096), + low_mmio: MemoryRange::new(0xdc000000..0xe0000000), + high_mmio: MemoryRange::new(0x1000000000..0x1040000000), + }]; + let builder = new_aarch64_builder_with_smmu(&mem, &topology, &pcie_host_bridges, smmu_base); + + let data = builder.build_iort().unwrap(); + + // IORT header + assert_eq!(&data[0..4], b"IORT"); + assert_eq!(u32_at(&data, 4) as usize, data.len()); + assert_eq!(checksum(&data), 0); + + // 3 nodes: ITS Group + SMMUv3 + 1 RC + assert_eq!(u32_at(&data, 36), 3); + + // First node: ITS Group at IORT_NODE_OFFSET + let its_node = iort::IORT_NODE_OFFSET as usize; + assert_eq!(data[its_node], iort::IORT_NODE_TYPE_ITS_GROUP); + let its_group_size = 24usize; // 20-byte struct + 4-byte ITS ID + + // Second node: SMMUv3 + let smmu_node = its_node + its_group_size; + assert_eq!(data[smmu_node], iort::IORT_NODE_TYPE_SMMUV3); + // base_address at offset 16 from node start + assert_eq!(u64_at(&data, smmu_node + 16), smmu_base); + // flags: COHACC | DEVICEID_VALID (has ITS mappings) + assert_eq!( + u32_at(&data, smmu_node + 24), + iort::IORT_SMMUV3_FLAG_COHACC | iort::IORT_SMMUV3_FLAG_DEVICEID_VALID + ); + // model: 0 (generic) + assert_eq!(u32_at(&data, smmu_node + 40), 0); + // mapping_count = 2 (range + single for MSI domain) + assert_eq!(u32_at(&data, smmu_node + 8), 2); + // device_id_mapping_index = 1 + assert_eq!(u32_at(&data, smmu_node + 64), 1); + // SMMU mapping
[0]: range mapping for PCI device stream IDs + let smmu_node_len = u16_at(&data, smmu_node + 1) as usize; + let smmu_mapping_0 = smmu_node + 68; // IortSmmuV3 is 68 bytes + assert_eq!(u32_at(&data, smmu_mapping_0 + 12), iort::IORT_NODE_OFFSET); // → ITS group + assert_eq!(u32_at(&data, smmu_mapping_0 + 16), 0); // flags: no SINGLE_MAPPING + // SMMU mapping [1]: single mapping for SMMU's own MSI domain + let smmu_mapping_1 = smmu_mapping_0 + 20; // IortIdMapping is 20 bytes + assert_eq!(u32_at(&data, smmu_mapping_1 + 12), iort::IORT_NODE_OFFSET); // → ITS group + assert_eq!( + u32_at(&data, smmu_mapping_1 + 16), + iort::IORT_ID_SINGLE_MAPPING + ); // flags + + // Third node: Root Complex + let rc_node = smmu_node + smmu_node_len; + assert_eq!(data[rc_node], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX); + assert_eq!(u32_at(&data, rc_node + 8), 1); // mapping_count + // RC → SMMUv3 mapping + let rc_mapping = rc_node + 36; + assert_eq!(u32_at(&data, rc_mapping), 0); // input_base + assert_eq!(u32_at(&data, rc_mapping + 4), 0xFFFF); // id_count + assert_eq!(u32_at(&data, rc_mapping + 8), 0); // output_base (0: has SMMU) + assert_eq!(u32_at(&data, rc_mapping + 12), smmu_node as u32); // → SMMUv3 + } + + #[test] + fn test_iort_with_smmu_multi_rc() { + use acpi_spec::iort; + + let mem = new_mem(); + let topology = new_aarch64_its_topology(); + let smmu_base: u64 = 0xEFFA_0000; + let pcie_host_bridges = vec![ + PcieHostBridge { + index: 0, + segment: 0, + start_bus: 0, + end_bus: 255, + ecam_range: MemoryRange::new(0..256 * 256 * 4096), + low_mmio: MemoryRange::new(0xdc000000..0xe0000000), + high_mmio: MemoryRange::new(0x1000000000..0x1040000000), + }, + PcieHostBridge { + index: 1, + segment: 2, + start_bus: 0, + end_bus: 63, + ecam_range: MemoryRange::new(5 * GB..5 * GB + 64 * 256 * 4096), + low_mmio: MemoryRange::new(0xe0000000..0xe4000000), + high_mmio: MemoryRange::new(0x1040000000..0x1080000000), + }, + ]; + let builder = new_aarch64_builder_with_smmu(&mem, &topology, 
&pcie_host_bridges, smmu_base); + + let data = builder.build_iort().unwrap(); + + // 4 nodes: ITS + SMMUv3 + 2 RCs + assert_eq!(u32_at(&data, 36), 4); + assert_eq!(checksum(&data), 0); + + // ITS Group + let its_node = iort::IORT_NODE_OFFSET as usize; + let its_group_size = 24usize; + + // SMMUv3 node + let smmu_node = its_node + its_group_size; + assert_eq!(data[smmu_node], iort::IORT_NODE_TYPE_SMMUV3); + let smmu_node_len = u16_at(&data, smmu_node + 1) as usize; + + // RC 0: segment 0 → SMMUv3 + let rc0 = smmu_node + smmu_node_len; + assert_eq!(data[rc0], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX); + let rc0_mapping = rc0 + 36; + assert_eq!(u32_at(&data, rc0_mapping + 8), 0); // output_base (0: has SMMU) + assert_eq!(u32_at(&data, rc0_mapping + 12), smmu_node as u32); // → SMMUv3 + + // RC 1: segment 2 → ITS directly (only segment 0 uses SMMU) + let rc0_len = u16_at(&data, rc0 + 1) as usize; + let rc1 = rc0 + rc0_len; + assert_eq!(data[rc1], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX); + let rc1_mapping = rc1 + 36; + assert_eq!(u32_at(&data, rc1_mapping + 8), 2 << 16); // output_base seg 2 + assert_eq!(u32_at(&data, rc1_mapping + 12), its_node as u32); // → ITS group + } + + #[test] + fn test_iort_without_smmu_unchanged() { + // Verify the no-SMMU case still produces RC→ITS directly (regression). 
+ use acpi_spec::iort; + + let mem = new_mem(); + let topology = new_aarch64_its_topology(); + let pcie_host_bridges = vec![PcieHostBridge { + index: 0, + segment: 0, + start_bus: 0, + end_bus: 255, + ecam_range: MemoryRange::new(0..256 * 256 * 4096), + low_mmio: MemoryRange::new(0xdc000000..0xe0000000), + high_mmio: MemoryRange::new(0x1000000000..0x1040000000), + }]; + let builder = new_aarch64_builder(&mem, &topology, &pcie_host_bridges); + + let data = builder.build_iort().unwrap(); + + // 2 nodes: ITS Group + RC (no SMMUv3) + assert_eq!(u32_at(&data, 36), 2); + + // RC mapping points directly to ITS group + let its_node = iort::IORT_NODE_OFFSET as usize; + let rc_node = its_node + 24; // ITS group = 24 bytes + assert_eq!(data[rc_node], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX); + let rc_mapping = rc_node + 36; + assert_eq!(u32_at(&data, rc_mapping + 12), iort::IORT_NODE_OFFSET); // → ITS group + } + + #[test] + fn test_iort_smmuv3_node_fields() { + use acpi_spec::iort; + + let mem = new_mem(); + let topology = new_aarch64_its_topology(); + let smmu_base: u64 = 0xEFFA_0000; + let pcie_host_bridges = vec![PcieHostBridge { + index: 0, + segment: 0, + start_bus: 0, + end_bus: 255, + ecam_range: MemoryRange::new(0..256 * 256 * 4096), + low_mmio: MemoryRange::new(0xdc000000..0xe0000000), + high_mmio: MemoryRange::new(0x1000000000..0x1040000000), + }]; + let builder = new_aarch64_builder_with_smmu(&mem, &topology, &pcie_host_bridges, smmu_base); + + let data = builder.build_iort().unwrap(); + + let smmu_node = iort::IORT_NODE_OFFSET as usize + 24; // after ITS group + // Node type + assert_eq!(data[smmu_node], iort::IORT_NODE_TYPE_SMMUV3); + // Revision + assert_eq!(data[smmu_node + 3], iort::IORT_SMMUV3_REVISION); + // Base address + assert_eq!(u64_at(&data, smmu_node + 16), smmu_base); + // Flags: COHACC | DEVICEID_VALID + assert_eq!( + u32_at(&data, smmu_node + 24), + iort::IORT_SMMUV3_FLAG_COHACC | iort::IORT_SMMUV3_FLAG_DEVICEID_VALID + ); + // Reserved + 
assert_eq!(u32_at(&data, smmu_node + 28), 0); + // VATOS address = 0 + assert_eq!(u64_at(&data, smmu_node + 32), 0); + // Model = 0 (generic) + assert_eq!( + u32_at(&data, smmu_node + 40), + iort::IORT_SMMUV3_MODEL_GENERIC + ); + // GSIVs: wired SPIs for event and gerror + assert_eq!(u32_at(&data, smmu_node + 44), 35); // event_gsiv + assert_eq!(u32_at(&data, smmu_node + 48), 0); // pri_gsiv + assert_eq!(u32_at(&data, smmu_node + 52), 36); // gerr_gsiv + assert_eq!(u32_at(&data, smmu_node + 56), 0); // sync_gsiv + } } diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs index 77b04fdd52..a9c1bfba41 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs @@ -493,3 +493,111 @@ async fn pcie_nvme_boot(config: PetriVmBuilder) -> anyhow:: vm.wait_for_clean_teardown().await?; Ok(()) } + +/// Test SMMUv3 IOMMU emulation with a mixed topology: +/// +/// - Root complex s0rc0 (segment 0): SMMU enabled, virtio-net + NVMe behind it +/// - Root complex s1rc0 (segment 1): no SMMU, virtio-net behind it +/// +/// Verifies: +/// 1. Linux discovers the SMMUv3 (dmesg shows arm-smmu-v3 init) +/// 2. IORT ACPI table is present +/// 3. Devices behind the SMMU RC are in IOMMU groups +/// 4. Devices on both RCs enumerate and function (block I/O, network interfaces) +/// 5. DMA through SMMU works (NVMe I/O behind the SMMU) +#[openvmm_test(linux_direct_aarch64)] +async fn smmu_mixed_topology(config: PetriVmBuilder) -> anyhow::Result<()> { + let (vm, agent) = config + .modify_backend(|b| { + b.with_pcie_root_topology(2, 1, 4) // 2 segments, 1 RC each, 4 ports each + .with_smmu(&["s0rc0"]) // SMMU only on segment 0's RC + .with_pcie_nvme("s0rc0rp0", PCIE_NVME_SUBSYSTEM_IDS[0]) + .with_virtio_nic("s0rc0rp1") + .with_pcie_nvme("s1rc0rp0", PCIE_NVME_SUBSYSTEM_IDS[1]) + .with_virtio_nic("s1rc0rp1") + }) + .run() + .await?; + + let sh = agent.unix_shell(); + + // 1. 
Verify SMMUv3 is discovered by Linux + let dmesg = cmd!(sh, "dmesg").read().await?; + tracing::info!(dmesg_len = dmesg.len(), "dmesg captured"); + + let smmu_lines: Vec<&str> = dmesg + .lines() + .filter(|l| l.contains("smmu") || l.contains("SMMU") || l.contains("arm-smmu")) + .collect(); + tracing::info!(?smmu_lines, "SMMU-related dmesg lines"); + assert!( + dmesg.contains("arm-smmu-v3"), + "Linux should discover the SMMUv3 in dmesg. SMMU lines:\n{}", + smmu_lines.join("\n") + ); + + // 2. Verify IORT ACPI table is present + let acpi_tables = cmd!(sh, "ls /sys/firmware/acpi/tables/").read().await?; + assert!( + acpi_tables.contains("IORT"), + "IORT ACPI table should be present. Tables: {acpi_tables}" + ); + + // 3. Verify IOMMU groups exist (devices behind the SMMU RC) + let iommu_groups = cmd!(sh, "ls /sys/kernel/iommu_groups/") + .read() + .await + .unwrap_or_default(); + tracing::info!(%iommu_groups, "IOMMU groups"); + assert!( + !iommu_groups.trim().is_empty(), + "IOMMU groups should exist for devices behind the SMMU" + ); + + // 4. Verify all NVMe devices enumerate and have block devices + let block_devs = cmd!(sh, "ls /sys/block/").read().await?; + let nvme_count = block_devs + .split_whitespace() + .filter(|d| d.starts_with("nvme")) + .count(); + assert_eq!( + nvme_count, 2, + "both NVMe controllers should create block devices: {block_devs}" + ); + + // 5. Verify NVMe behind SMMU works: write and read back data + // The NVMe on s0rc0rp0 has DMA going through SMMU translation. + // If the SMMU page tables are not set up correctly, this I/O would fail. 
+ let nvme_devs: Vec<&str> = block_devs + .split_whitespace() + .filter(|d| d.starts_with("nvme")) + .collect(); + if let Some(dev) = nvme_devs.first() { + // Write a pattern and read it back to exercise DMA through SMMU + cmd!( + sh, + "dd if=/dev/urandom of=/dev/{dev} bs=4096 count=16 oflag=direct" + ) + .read() + .await?; + cmd!( + sh, + "dd if=/dev/{dev} of=/dev/null bs=4096 count=16 iflag=direct" + ) + .read() + .await?; + } + + // 6. Verify virtio-net interfaces exist on both RCs + let net_devs = cmd!(sh, "ls /sys/class/net/").read().await?; + let net_count = net_devs.split_whitespace().filter(|d| *d != "lo").count(); + tracing::info!(%net_devs, net_count, "network devices"); + assert!( + net_count >= 2, + "at least 2 network interfaces should exist (got {net_count}): {net_devs}" + ); + + agent.power_off().await?; + vm.wait_for_clean_teardown().await?; + Ok(()) +} From a3682507eaa651c2cf80fe90213ac90d8bc8df52 Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 16:22:52 -0700 Subject: [PATCH 7/7] cleanup --- openvmm/openvmm_core/src/worker/dispatch.rs | 113 +++++------------- .../src/worker/dispatch/smmu_wiring.rs | 100 ++++++++++++++++ 2 files changed, 131 insertions(+), 82 deletions(-) create mode 100644 openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index a1a9044f16..0cd1a38c54 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+mod smmu_wiring; + use crate::emuplat; use crate::partition::BindHvliteVp; use crate::partition::HvlitePartition; @@ -407,22 +409,12 @@ pub(crate) struct InitializedVm { chipset_low_mmio: MemoryRange, chipset_high_mmio: MemoryRange, vtl2_chipset_mmio: MemoryRange, - resolved_smmu_resources: Vec, + resolved_smmu_resources: Vec, processor_topology: ProcessorTopology, igvm_file: Option, driver_source: VmTaskDriverSource, } -/// Resolved resources for a single SMMUv3 instance. -struct ResolvedSmmuResources { - /// MMIO base address (from the memory layout allocator). - base: u64, - /// GIC INTID for the event queue interrupt (from the SPI allocator). - evtq_gsiv: u32, - /// GIC INTID for the global error interrupt (from the SPI allocator). - gerr_gsiv: u32, -} - trait ExtractTopologyConfig { fn to_config(&self) -> ProcessorTopologyConfig; } @@ -515,6 +507,7 @@ impl ExtractTopologyConfig for ProcessorTopology { struct Aarch64TopologyResult { processor_topology: ProcessorTopology, spi_layout: super::spi_layout::ResolvedSpiLayout, + smmu_count: usize, } #[cfg(guest_arch = "aarch64")] @@ -654,6 +647,8 @@ fn build_aarch64_topology( gic_nr_irqs, }; + let smmu_count = arch.smmu.len(); + let mut builder = TopologyBuilder::new_aarch64(platform); if let Some(smt) = config.enable_smt { builder.smt_enabled(smt); @@ -666,6 +661,7 @@ fn build_aarch64_topology( Ok(Aarch64TopologyResult { processor_topology: builder.build(config.proc_count)?, spi_layout, + smmu_count, }) } @@ -894,9 +890,13 @@ impl InitializedVm { }; #[cfg(guest_arch = "aarch64")] - let (processor_topology, spi_layout) = { + let (processor_topology, spi_layout, smmu_count) = { let result = build_aarch64_topology(&cfg.processor_topology, &platform_info)?; - (result.processor_topology, result.spi_layout) + ( + result.processor_topology, + result.spi_layout, + result.smmu_count, + ) }; #[cfg(not(guest_arch = "aarch64"))] let processor_topology = build_x86_topology(&cfg.processor_topology)?; @@ -956,22 +956,9 @@ impl 
InitializedVm { .filter(|(bus, _)| matches!(bus, VirtioBus::Mmio)) .count(); - // Count SMMU instances so the layout engine can allocate their MMIO. - let smmu_count = { - #[cfg(guest_arch = "aarch64")] - { - match &cfg.processor_topology.arch { - Some(ArchTopologyConfig::Aarch64(Aarch64TopologyConfig { smmu, .. })) => { - smmu.len() - } - _ => 0, - } - } - #[cfg(not(guest_arch = "aarch64"))] - { - 0 - } - }; + // smmu_count was already computed by build_aarch64_topology. + #[cfg(not(guest_arch = "aarch64"))] + let smmu_count = 0; let resolved_layout = resolve_memory_layout(MemoryLayoutInput { mem_size: cfg.memory.mem_size, @@ -992,21 +979,11 @@ impl InitializedVm { let vtl2_chipset_mmio = resolved_layout.vtl2_chipset_mmio; // Combine SMMU MMIO ranges with SPI layout. - cfg_if! { - if #[cfg(guest_arch = "aarch64")] { - let resolved_smmu_resources: Vec = resolved_layout.smmu_ranges - .iter() - .zip(&spi_layout.smmu) - .map(|(range, spis)| ResolvedSmmuResources { - base: range.start(), - evtq_gsiv: spis.evtq_gsiv, - gerr_gsiv: spis.gerr_gsiv, - }) - .collect(); - } else { - let resolved_smmu_resources: Vec = Vec::new(); - } - } + #[cfg(guest_arch = "aarch64")] + let resolved_smmu_resources = + smmu_wiring::resolve_smmu_resources(&resolved_layout.smmu_ranges, &spi_layout); + #[cfg(not(guest_arch = "aarch64"))] + let resolved_smmu_resources: Vec = Vec::new(); // Place the alias map at the end of the address space. Newer versions // of OpenHCL support receiving this offset via devicetree (especially @@ -2111,40 +2088,13 @@ impl InitializedVm { }); } - // Build a port-name → SMMU shared state map. Each downstream port of - // an SMMU-covered root complex inherits that SMMU. 
- let smmu_port_map: std::collections::HashMap, Arc> = - smmu_shared_states - .iter() - .zip(pcie_root_complexes.iter()) - .flat_map(|(shared, rc)| { - let shared = shared.clone(); - rc.lock() - .downstream_ports() - .into_iter() - .filter_map(move |dpi| shared.as_ref().map(|s| (dpi.name, s.clone()))) - }) - .collect(); - - // Track which RCs have SMMUs (for VFIO blocking). - let mut smmu_per_rc = vec![false; pcie_host_bridges.len()]; - for inst in &smmu_instances { - if let Some(&i) = pcie_rc_name_to_idx.get(&inst.rc_name) { - smmu_per_rc[i] = true; - } - } - - // Build port-name set for ports behind SMMUs. - let smmu_s1_ports: std::collections::HashSet> = smmu_per_rc - .iter() - .zip(pcie_root_complexes.iter()) - .flat_map(|(&has_smmu, rc)| { - rc.lock() - .downstream_ports() - .into_iter() - .filter_map(move |dpi| if has_smmu { Some(dpi.name) } else { None }) - }) - .collect(); + let smmu_port_maps = smmu_wiring::build_smmu_port_maps( + &smmu_shared_states, + &pcie_root_complexes, + &smmu_instances, + &pcie_rc_name_to_idx, + &pcie_host_bridges, + ); // Resolve PCIe devices concurrently. // @@ -2163,8 +2113,7 @@ impl InitializedVm { let partition = &partition; let mapper = &mapper; let port_info = &port_info; - let smmu_port_map = &smmu_port_map; - let smmu_s1_ports = &smmu_s1_ports; + let smmu_port_maps = &smmu_port_maps; async move { let port_name: Arc = dev_cfg.port_name.into(); let pi = port_info.get(&port_name).ok_or_else(|| { @@ -2179,7 +2128,7 @@ impl InitializedVm { // into the host IOMMU, so VFIO DMA would bypass S1 // translation. This will be lifted when iommufd // nested translation support is available. - if dev_cfg.resource.id() == "vfio" && smmu_s1_ports.contains(&port_name) { + if dev_cfg.resource.id() == "vfio" && smmu_port_maps.s1_ports.contains(&port_name) { anyhow::bail!( "VFIO device on port {:?} is behind an S1-capable SMMU, \ but iommufd nested translation is not available. 
\ @@ -2198,7 +2147,7 @@ impl InitializedVm { &pi.bus_range, pi.segment, use_its, - smmu_port_map.get(&port_name), + smmu_port_maps.port_map.get(&port_name), ); vmm_core::device_builder::build_pcie_device( diff --git a/openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs b/openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs new file mode 100644 index 0000000000..b58e61187a --- /dev/null +++ b/openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! SMMU resource resolution and wiring helpers for aarch64 VMs. +//! +//! This module handles combining SMMU MMIO ranges (from the memory layout +//! allocator) with SPI assignments (from the SPI allocator) into resolved +//! resources, and building the lookup maps needed for per-device wiring. + +use closeable_mutex::CloseableMutex; +use pcie::root::GenericPcieRootComplex; +use std::collections::HashMap; +use std::collections::HashSet; +use std::sync::Arc; +use vm_topology::pcie::PcieHostBridge; + +/// Resolved resources for a single SMMUv3 instance, combining MMIO and SPI +/// allocations. +pub(super) struct ResolvedSmmuResources { + /// MMIO base address (from the memory layout allocator). + pub base: u64, + /// GIC INTID for the event queue interrupt (from the SPI allocator). + pub evtq_gsiv: u32, + /// GIC INTID for the global error interrupt (from the SPI allocator). + pub gerr_gsiv: u32, +} + +/// Combines SMMU MMIO ranges from the memory layout with SPI assignments from +/// the SPI layout into resolved resources. 
+#[cfg(guest_arch = "aarch64")] +pub(super) fn resolve_smmu_resources( + smmu_ranges: &[memory_range::MemoryRange], + spi_layout: &crate::worker::spi_layout::ResolvedSpiLayout, +) -> Vec { + smmu_ranges + .iter() + .zip(&spi_layout.smmu) + .map(|(range, spis)| ResolvedSmmuResources { + base: range.start(), + evtq_gsiv: spis.evtq_gsiv, + gerr_gsiv: spis.gerr_gsiv, + }) + .collect() +} + +/// Lookup maps for SMMU-covered PCIe ports, used during device wiring and +/// VFIO validation. +pub(super) struct SmmuPortMaps { + /// Maps port names to their SMMU shared state (for per-device wrapping). + pub port_map: HashMap, Arc>, + /// Set of port names behind S1-capable SMMUs (for VFIO blocking). + pub s1_ports: HashSet>, +} + +/// Builds the port-level SMMU lookup maps from per-RC shared state. +/// +/// `smmu_shared_states` is indexed parallel to `pcie_host_bridges` / `pcie_root_complexes`, +/// with `None` for root complexes that have no SMMU. +pub(super) fn build_smmu_port_maps( + smmu_shared_states: &[Option>], + pcie_root_complexes: &[Arc>], + smmu_instances: &[openvmm_defs::config::SmmuInstanceConfig], + pcie_rc_name_to_idx: &HashMap, + pcie_host_bridges: &[PcieHostBridge], +) -> SmmuPortMaps { + // Build a port-name → SMMU shared state map. Each downstream port of + // an SMMU-covered root complex inherits that SMMU. + let port_map: HashMap, Arc> = smmu_shared_states + .iter() + .zip(pcie_root_complexes.iter()) + .flat_map(|(shared, rc)| { + let shared = shared.clone(); + rc.lock() + .downstream_ports() + .into_iter() + .filter_map(move |dpi| shared.as_ref().map(|s| (dpi.name, s.clone()))) + }) + .collect(); + + // Track which RCs have SMMUs, then collect port names behind them. 
+ let mut smmu_per_rc = vec![false; pcie_host_bridges.len()]; + for inst in smmu_instances { + if let Some(&i) = pcie_rc_name_to_idx.get(&inst.rc_name) { + smmu_per_rc[i] = true; + } + } + + let s1_ports: HashSet> = smmu_per_rc + .iter() + .zip(pcie_root_complexes.iter()) + .flat_map(|(&has_smmu, rc)| { + rc.lock() + .downstream_ports() + .into_iter() + .filter_map(move |dpi| if has_smmu { Some(dpi.name) } else { None }) + }) + .collect(); + + SmmuPortMaps { port_map, s1_ports } +}