From ba834301849747d884e47ec6176804a69952bea4 Mon Sep 17 00:00:00 2001
From: John Starks <jostarks@microsoft.com>
Date: Tue, 19 May 2026 14:16:40 -0700
Subject: [PATCH 1/7] openvmm_core: add GIC SPI layout allocator

Introduce a deterministic SPI layout resolver for aarch64 VMs, analogous
to the memory layout engine. All GIC SPI assignments for platform devices
(currently only the GICv2m MSI block) are computed in a single top-down
pass over the SPI range [64, 1019], so the layout is a pure function of
the VM config. This is critical for hibernation: a resumed VM must get the
same SPI layout as the original.

The BuildTopology trait is replaced with cfg-gated free functions, and the
GicMsiConfig::V2m variant now carries an explicit spi_count field so the
user can control the v2m block size. The DEFAULT_GIC_V2M_SPI_BASE constant
is removed since the allocator picks the base dynamically.

This is not very interesting yet, but it will become more interesting
when we add vSMMUs (which need SPIs).
---
 openvmm/openvmm_core/src/worker/dispatch.rs   | 329 +++++++++---------
 openvmm/openvmm_core/src/worker/mod.rs        |   1 +
 openvmm/openvmm_core/src/worker/spi_layout.rs | 114 ++++++
 openvmm/openvmm_defs/src/config.rs            |  10 +-
 openvmm/openvmm_entry/src/lib.rs              |   4 +-
 5 files changed, 294 insertions(+), 164 deletions(-)
 create mode 100644 openvmm/openvmm_core/src/worker/spi_layout.rs
diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs
index 4cbe51c1b8..65557ed06b 100644
--- a/openvmm/openvmm_core/src/worker/dispatch.rs
+++ b/openvmm/openvmm_core/src/worker/dispatch.rs
@@ -34,7 +34,6 @@ use ide_resources::IdeDeviceConfig;
 use igvm::IgvmFile;
 use input_core::InputData;
 use input_core::MultiplexedInputHandle;
-use inspect::Inspect;
 use local_clock::LocalClockDelta;
 use membacking::GuestMemoryBuilder;
 use membacking::GuestMemoryManager;
@@ -113,7 +112,6 @@ use vm_resource::kind::VirtioDeviceHandle;
 use vm_resource::kind::VmbusDeviceHandleKind;
 use vm_topology::memory::MemoryLayout;
 use vm_topology::pcie::PcieHostBridge;
-use vm_topology::processor::ArchTopology;
 use vm_topology::processor::ProcessorTopology;
 use vm_topology::processor::TopologyBuilder;
 use vm_topology::processor::aarch64::Aarch64Topology;
@@ -414,13 +412,6 @@ pub(crate) struct InitializedVm {
     driver_source: VmTaskDriverSource,
 }
 
-trait BuildTopology<T: ArchTopology + Inspect> {
-    fn to_topology(
-        &self,
-        platform_info: &virt::PlatformInfo,
-    ) -> anyhow::Result<ProcessorTopology<T>>;
-}
-
 trait ExtractTopologyConfig {
     fn to_config(&self) -> ProcessorTopologyConfig;
 }
@@ -446,38 +437,35 @@ impl ExtractTopologyConfig for ProcessorTopology<X86Topology> {
 }
 
 #[cfg(guest_arch = "x86_64")]
-impl BuildTopology<X86Topology> for ProcessorTopologyConfig {
-    fn to_topology(
-        &self,
-        _platform_info: &virt::PlatformInfo,
-    ) -> anyhow::Result<ProcessorTopology<X86Topology>> {
-        use vm_topology::processor::x86::X2ApicState;
-
-        let arch = match &self.arch {
-            None => Default::default(),
-            Some(ArchTopologyConfig::X86(arch)) => arch.clone(),
-            _ => anyhow::bail!("invalid architecture config"),
-        };
-        let mut builder = TopologyBuilder::from_host_topology()?;
-        builder.apic_id_offset(arch.apic_id_offset);
-        if let Some(smt) = self.enable_smt {
-            builder.smt_enabled(smt);
-        }
-        if let Some(count) = self.vps_per_socket {
-            builder.vps_per_socket(count);
-        }
-        let x2apic = match arch.x2apic {
-            X2ApicConfig::Auto => {
-                // FUTURE: query the hypervisor for a recommendation.
-                X2ApicState::Supported
-            }
-            X2ApicConfig::Supported => X2ApicState::Supported,
-            X2ApicConfig::Unsupported => X2ApicState::Unsupported,
-            X2ApicConfig::Enabled => X2ApicState::Enabled,
-        };
-        builder.x2apic(x2apic);
-        Ok(builder.build(self.proc_count)?)
+fn build_x86_topology(
+    config: &ProcessorTopologyConfig,
+) -> anyhow::Result<ProcessorTopology<X86Topology>> {
+    use vm_topology::processor::x86::X2ApicState;
+
+    let arch = match &config.arch {
+        None => Default::default(),
+        Some(ArchTopologyConfig::X86(arch)) => arch.clone(),
+        _ => anyhow::bail!("invalid architecture config"),
+    };
+    let mut builder = TopologyBuilder::from_host_topology()?;
+    builder.apic_id_offset(arch.apic_id_offset);
+    if let Some(smt) = config.enable_smt {
+        builder.smt_enabled(smt);
     }
+    if let Some(count) = config.vps_per_socket {
+        builder.vps_per_socket(count);
+    }
+    let x2apic = match arch.x2apic {
+        X2ApicConfig::Auto => {
+            // FUTURE: query the hypervisor for a recommendation.
+            X2ApicState::Supported
+        }
+        X2ApicConfig::Supported => X2ApicState::Supported,
+        X2ApicConfig::Unsupported => X2ApicState::Unsupported,
+        X2ApicConfig::Enabled => X2ApicState::Enabled,
+    };
+    builder.x2apic(x2apic);
+    Ok(builder.build(config.proc_count)?)
 }
 
 impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
@@ -512,140 +500,107 @@ impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
 }
 
 #[cfg(guest_arch = "aarch64")]
-impl BuildTopology<Aarch64Topology> for ProcessorTopologyConfig {
-    fn to_topology(
-        &self,
-        platform_info: &virt::PlatformInfo,
-    ) -> anyhow::Result<ProcessorTopology<Aarch64Topology>> {
-        use vm_topology::processor::aarch64::Aarch64PlatformConfig;
-        use vm_topology::processor::aarch64::GicItsInfo;
-        use vm_topology::processor::aarch64::GicMsiController;
-        use vm_topology::processor::aarch64::GicV2mInfo;
-
-        let arch = match &self.arch {
-            None => Default::default(),
-            Some(ArchTopologyConfig::Aarch64(arch)) => arch.clone(),
-            _ => anyhow::bail!("invalid architecture config"),
-        };
+fn build_aarch64_topology(
+    config: &ProcessorTopologyConfig,
+    platform_info: &virt::PlatformInfo,
+    gic_msi: vm_topology::processor::aarch64::GicMsiController,
+) -> anyhow::Result<ProcessorTopology<Aarch64Topology>> {
+    use vm_topology::processor::aarch64::Aarch64PlatformConfig;
+
+    let arch = match &config.arch {
+        None => Default::default(),
+        Some(ArchTopologyConfig::Aarch64(arch)) => arch.clone(),
+        _ => anyhow::bail!("invalid architecture config"),
+    };
 
-        let pmu_gsiv = match arch.pmu_gsiv {
-            PmuGsivConfig::Disabled => None,
-            PmuGsivConfig::Gsiv(gsiv) => Some(gsiv),
-            PmuGsivConfig::Platform => platform_info.platform_gsiv,
-        };
+    let pmu_gsiv = match arch.pmu_gsiv {
+        PmuGsivConfig::Disabled => None,
+        PmuGsivConfig::Gsiv(gsiv) => Some(gsiv),
+        PmuGsivConfig::Platform => platform_info.platform_gsiv,
+    };
 
-        // TODO: When this value is supported on all platforms, we should change
-        // the arch config to not be an option. For now, warn since the ARM VBSA
-        // expects this to be available.
-        if pmu_gsiv.is_none() {
-            tracing::warn!("PMU GSIV is not set");
-        }
+    // TODO: When this value is supported on all platforms, we should change
+    // the arch config to not be an option. For now, warn since the ARM VBSA
+    // expects this to be available.
+    if pmu_gsiv.is_none() {
+        tracing::warn!("PMU GSIV is not set");
+    }
 
-        let (gic_distributor_base, gic_version) = match &arch.gic_config {
-            Some(GicConfig::V3(config)) => {
-                let dist = config
-                    .as_ref()
-                    .map(|c| c.gic_distributor_base)
-                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE);
-                let redist = config
-                    .as_ref()
-                    .map(|c| c.gic_redistributors_base)
-                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE);
+    let (gic_distributor_base, gic_version) = match &arch.gic_config {
+        Some(GicConfig::V3(config)) => {
+            let dist = config
+                .as_ref()
+                .map(|c| c.gic_distributor_base)
+                .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE);
+            let redist = config
+                .as_ref()
+                .map(|c| c.gic_redistributors_base)
+                .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE);
+            (
+                dist,
+                GicVersion::V3 {
+                    redistributors_base: redist,
+                },
+            )
+        }
+        Some(GicConfig::V2(config)) => {
+            let dist = config
+                .as_ref()
+                .map(|c| c.gic_distributor_base)
+                .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE);
+            let cpu_if = config
+                .as_ref()
+                .map(|c| c.cpu_interface_base)
+                .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE);
+            (
+                dist,
+                GicVersion::V2 {
+                    cpu_interface_base: cpu_if,
+                },
+            )
+        }
+        None => {
+            // No explicit GIC config — use the hypervisor's detected version
+            // with default addresses.
+            let dist = openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE;
+            let second = openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE;
+            if platform_info.supports_gic_v3 {
                 (
                     dist,
                     GicVersion::V3 {
-                        redistributors_base: redist,
+                        redistributors_base: second,
                     },
                 )
-            }
-            Some(GicConfig::V2(config)) => {
-                let dist = config
-                    .as_ref()
-                    .map(|c| c.gic_distributor_base)
-                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE);
-                let cpu_if = config
-                    .as_ref()
-                    .map(|c| c.cpu_interface_base)
-                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE);
+            } else {
                 (
                     dist,
                     GicVersion::V2 {
-                        cpu_interface_base: cpu_if,
+                        cpu_interface_base: second,
                     },
                 )
             }
-            None => {
-                // No explicit GIC config — use the hypervisor's detected version
-                // with default addresses.
-                let dist = openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE;
-                let second = openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE;
-                if platform_info.supports_gic_v3 {
-                    (
-                        dist,
-                        GicVersion::V3 {
-                            redistributors_base: second,
-                        },
-                    )
-                } else {
-                    (
-                        dist,
-                        GicVersion::V2 {
-                            cpu_interface_base: second,
-                        },
-                    )
-                }
-            }
-        };
-
-        // Use the ITS for MSI delivery when the backend supports it
-        // (KVM with GICv3). Otherwise fall back to GICv2m (SPI-based MSIs).
-        use openvmm_defs::config::GicMsiConfig;
-        let is_gicv2 = matches!(gic_version, GicVersion::V2 { .. });
-        let use_its = match arch.gic_msi {
-            GicMsiConfig::Auto => platform_info.supports_its && !is_gicv2,
-            GicMsiConfig::Its => {
-                if is_gicv2 {
-                    anyhow::bail!("ITS is incompatible with GICv2");
-                }
-                if !platform_info.supports_its {
-                    anyhow::bail!("ITS requested but the hypervisor does not support it");
-                }
-                true
-            }
-            GicMsiConfig::V2m => false,
-        };
-        let gic_msi = if use_its {
-            GicMsiController::Its(GicItsInfo {
-                its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE,
-            })
-        } else {
-            GicMsiController::V2m(GicV2mInfo {
-                frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
-                spi_base: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_BASE,
-                spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT,
-            })
-        };
+        }
+    };
 
-        let platform = Aarch64PlatformConfig {
-            gic_distributor_base,
-            gic_version,
-            gic_msi,
-            pmu_gsiv,
-            virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI,
-            gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS,
-        };
+    let platform = Aarch64PlatformConfig {
+        gic_distributor_base,
+        gic_version,
+        gic_msi,
+        pmu_gsiv,
+        virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI,
+        gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS,
+    };
 
-        let mut builder = TopologyBuilder::new_aarch64(platform);
-        if let Some(smt) = self.enable_smt {
-            builder.smt_enabled(smt);
-        }
-        if let Some(count) = self.vps_per_socket {
-            builder.vps_per_socket(count);
-        } else {
-            builder.vps_per_socket(self.proc_count);
-        }
-        Ok(builder.build(self.proc_count)?)
+    let mut builder = TopologyBuilder::new_aarch64(platform);
+    if let Some(smt) = config.enable_smt {
+        builder.smt_enabled(smt);
+    }
+    if let Some(count) = config.vps_per_socket {
+        builder.vps_per_socket(count);
+    } else {
+        builder.vps_per_socket(config.proc_count);
     }
+    Ok(builder.build(config.proc_count)?)
 }
 
 /// A VM that has been loaded and can be run.
@@ -818,6 +773,7 @@ impl InitializedVm {
     pub(crate) async fn new_with_hypervisor<P, H>(
         driver_source: VmTaskDriverSource,
         hypervisor: &mut H,
+        #[cfg_attr(not(guest_arch = "aarch64"), expect(unused_variables))]
         platform_info: virt::PlatformInfo,
         cfg: Manifest,
         shared_memory: Option<SharedMemoryBacking>,
@@ -865,7 +821,64 @@ impl InitializedVm {
             None
         };
 
-        let processor_topology = cfg.processor_topology.to_topology(&platform_info)?;
+        cfg_if! {
+            if #[cfg(guest_arch = "aarch64")] {
+                use openvmm_defs::config::GicMsiConfig;
+                use vm_topology::processor::aarch64::GicItsInfo;
+                use vm_topology::processor::aarch64::GicMsiController;
+                use vm_topology::processor::aarch64::GicV2mInfo;
+
+                // Resolve ITS vs v2m and determine v2m SPI count.
+                let arch_config = match &cfg.processor_topology.arch {
+                    Some(ArchTopologyConfig::Aarch64(a)) => a,
+                    _ => &Aarch64TopologyConfig::default(),
+                };
+                let is_gicv2 = match &arch_config.gic_config {
+                    Some(GicConfig::V2(_)) => true,
+                    _ => !platform_info.supports_gic_v3,
+                };
+                let v2m_spi_count = match &arch_config.gic_msi {
+                    GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None,
+                    GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT),
+                    GicMsiConfig::Its => {
+                        if is_gicv2 {
+                            anyhow::bail!("ITS is incompatible with GICv2");
+                        }
+                        if !platform_info.supports_its {
+                            anyhow::bail!("ITS requested but the hypervisor does not support it");
+                        }
+                        None
+                    }
+                    GicMsiConfig::V2m { spi_count } => Some(*spi_count),
+                };
+
+                // Resolve SPI layout — all SPI allocations in one deterministic pass.
+                let spi_layout = super::spi_layout::resolve_spi_layout(
+                    &super::spi_layout::SpiLayoutInput {
+                        v2m_spi_count,
+                    },
+                )?;
+
+                // Build the GIC MSI controller from resolved SPIs.
+                let gic_msi = if let Some(count) = v2m_spi_count {
+                    GicMsiController::V2m(GicV2mInfo {
+                        frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
+                        spi_base: spi_layout.v2m_spi_base.expect("v2m base must be allocated when v2m_spi_count is Some"),
+                        spi_count: count,
+                    })
+                } else {
+                    GicMsiController::Its(GicItsInfo {
+                        its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE,
+                    })
+                };
+
+                let processor_topology =
+                    build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)?;
+            } else {
+                let processor_topology =
+                    build_x86_topology(&cfg.processor_topology)?;
+            }
+        }
 
         let proto = hypervisor
             .new_partition(virt::ProtoPartitionConfig {
diff --git a/openvmm/openvmm_core/src/worker/mod.rs b/openvmm/openvmm_core/src/worker/mod.rs
index b36faa2154..3df33c74e3 100644
--- a/openvmm/openvmm_core/src/worker/mod.rs
+++ b/openvmm/openvmm_core/src/worker/mod.rs
@@ -4,4 +4,5 @@
 pub mod dispatch;
 mod memory_layout;
 mod rom;
+mod spi_layout;
 pub mod vm_loaders;
diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs
new file mode 100644
index 0000000000..97065590e8
--- /dev/null
+++ b/openvmm/openvmm_core/src/worker/spi_layout.rs
@@ -0,0 +1,114 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#![cfg(guest_arch = "aarch64")]
+
+//! GIC SPI layout resolver for aarch64 VMs.
+//!
+//! This module determines the GIC SPI assignments for all platform devices
+//! that need dynamically allocated interrupts. It is the SPI analogue of
+//! [`super::memory_layout`]: all allocations happen in a single deterministic
+//! pass so that the assignments are a pure function of the VM configuration.
+//! This is critical for hibernation — a resumed VM must get the same SPI
+//! layout as the original.
+//!
+//! SPIs are allocated top-down from INTID 1019. This maximizes distance from
+//! the guest-side vPCI MSI allocator (Hyper-V PCI driver in Linux), which
+//! allocates bottom-up starting at INTID 64.
+
+/// Top-down GIC SPI allocator.
+struct SpiAllocator {
+    range_start: u32,
+    cursor: u32,
+}
+
+impl SpiAllocator {
+    fn new(range: std::ops::RangeInclusive<u32>) -> Self {
+        Self {
+            range_start: *range.start(),
+            cursor: *range.end(),
+        }
+    }
+
+    /// Allocates a single SPI, returning its GIC INTID.
+    fn alloc(&mut self, tag: &str) -> anyhow::Result<u32> {
+        if self.cursor < self.range_start {
+            anyhow::bail!("SPI exhausted allocating {tag}");
+        }
+        let intid = self.cursor;
+        self.cursor -= 1;
+        Ok(intid)
+    }
+
+    /// Allocates a contiguous block of `count` SPIs, returning the lowest
+    /// GIC INTID in the block. Fails if `count` is zero or does not fit.
+    fn alloc_block(&mut self, tag: &str, count: u32) -> anyhow::Result<u32> {
+        // `cursor` sits at `range_start - 1` once the range is used up, so add first.
+        let available = (self.cursor + 1).saturating_sub(self.range_start);
+        if count == 0 || count > available {
+            anyhow::bail!(
+                "SPI exhausted allocating {tag}: need {count}, only {available} remaining"
+            );
+        }
+        self.cursor -= count;
+        Ok(self.cursor + 1)
+    }
+}
+
+/// Inputs to the SPI layout resolver.
+pub(super) struct SpiLayoutInput {
+    /// Number of SPIs to reserve for GICv2m MSI delivery. `None` when using
+    /// ITS (no v2m block needed).
+    pub v2m_spi_count: Option<u32>,
+}
+
+/// Resolved SPI assignments for all platform devices.
+pub(super) struct ResolvedSpiLayout {
+    /// GICv2m SPI base INTID. `None` when using ITS.
+    pub v2m_spi_base: Option<u32>,
+}
+
+/// Resolves SPI assignments for all platform devices.
+///
+/// All allocations happen here in a single top-down pass over the SPI range
+/// `[64, 1019]`. The order of allocations determines the layout and must not
+/// change across OpenVMM versions for a given config, or hibernation will
+/// break.
+pub(super) fn resolve_spi_layout(input: &SpiLayoutInput) -> anyhow::Result<ResolvedSpiLayout> {
+    let mut spi = SpiAllocator::new(64..=1019);
+
+    // --- Allocation order (do not reorder!) ---
+
+    // 1. GICv2m MSI block.
+    let v2m_spi_base = input
+        .v2m_spi_count
+        .map(|count| spi.alloc_block("gicv2m", count))
+        .transpose()?;
+
+    Ok(ResolvedSpiLayout { v2m_spi_base })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn v2m_allocation() {
+        let result = resolve_spi_layout(&SpiLayoutInput {
+            v2m_spi_count: Some(64),
+        })
+        .unwrap();
+
+        assert_eq!(result.v2m_spi_base, Some(956));
+    }
+
+    #[test]
+    fn its_skips_v2m() {
+        let result = resolve_spi_layout(&SpiLayoutInput {
+            v2m_spi_count: None,
+        })
+        .unwrap();
+
+        assert_eq!(result.v2m_spi_base, None);
+    }
+}
diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs
index 4e4af4dd4b..c2144a8216 100644
--- a/openvmm/openvmm_defs/src/config.rs
+++ b/openvmm/openvmm_defs/src/config.rs
@@ -80,10 +80,7 @@ pub const DEFAULT_GIC_V2M_MSI_FRAME_BASE: u64 = 0xEFFE_8000;
 /// Size of the v2m MSI frame (one 4KB page is the architectural minimum).
 pub const GIC_V2M_MSI_FRAME_SIZE: u64 = 0x1000;
 
-/// First GIC interrupt ID reserved for PCIe MSIs via the v2m frame.
-/// Must be in the SPI range (32–1019) and not conflict with other devices.
-pub const DEFAULT_GIC_V2M_SPI_BASE: u32 = 512;
-/// Number of SPIs reserved for PCIe MSIs.
+/// Default number of SPIs reserved for PCIe MSIs when using GICv2m.
 pub const DEFAULT_GIC_V2M_SPI_COUNT: u32 = 64;
 
 /// Base address of the GICv3 ITS MMIO region. Must be 64 KiB aligned,
@@ -296,7 +293,10 @@ pub enum GicMsiConfig {
     /// Force GICv3 ITS for MSI delivery via LPIs.
     Its,
     /// Force GICv2m for MSI delivery via SPIs.
-    V2m,
+    V2m {
+        /// Number of SPIs to reserve for PCIe MSIs.
+        spi_count: u32,
+    },
 }
 
 #[derive(Debug, Protobuf, Default, Clone)]
diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs
index 6afbc14913..542253cb3f 100644
--- a/openvmm/openvmm_entry/src/lib.rs
+++ b/openvmm/openvmm_entry/src/lib.rs
@@ -1285,7 +1285,9 @@ async fn vm_config_from_command_line(
             gic_msi: match opt.gic_msi {
                 cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto,
                 cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its,
-                cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m,
+                cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m {
+                    spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT,
+                },
             },
         },
     );

From 8e7312c173b47961c341c9d5cb0b9ce1b0ec5fb7 Mon Sep 17 00:00:00 2001
From: John Starks <jostarks@microsoft.com>
Date: Tue, 19 May 2026 14:24:23 -0700
Subject: [PATCH 2/7] spi_layout: remove cfg_if, use #[cfg] attributes

---
 openvmm/openvmm_core/src/worker/dispatch.rs | 106 ++++++++++----------
 1 file changed, 52 insertions(+), 54 deletions(-)

diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs
index 65557ed06b..7601fe164f 100644
--- a/openvmm/openvmm_core/src/worker/dispatch.rs
+++ b/openvmm/openvmm_core/src/worker/dispatch.rs
@@ -821,64 +821,62 @@ impl InitializedVm {
             None
         };
 
-        cfg_if! {
-            if #[cfg(guest_arch = "aarch64")] {
-                use openvmm_defs::config::GicMsiConfig;
-                use vm_topology::processor::aarch64::GicItsInfo;
-                use vm_topology::processor::aarch64::GicMsiController;
-                use vm_topology::processor::aarch64::GicV2mInfo;
-
-                // Resolve ITS vs v2m and determine v2m SPI count.
-                let arch_config = match &cfg.processor_topology.arch {
-                    Some(ArchTopologyConfig::Aarch64(a)) => a,
-                    _ => &Aarch64TopologyConfig::default(),
-                };
-                let is_gicv2 = match &arch_config.gic_config {
-                    Some(GicConfig::V2(_)) => true,
-                    _ => !platform_info.supports_gic_v3,
-                };
-                let v2m_spi_count = match &arch_config.gic_msi {
-                    GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None,
-                    GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT),
-                    GicMsiConfig::Its => {
-                        if is_gicv2 {
-                            anyhow::bail!("ITS is incompatible with GICv2");
-                        }
-                        if !platform_info.supports_its {
-                            anyhow::bail!("ITS requested but the hypervisor does not support it");
-                        }
-                        None
+        #[cfg(guest_arch = "aarch64")]
+        let processor_topology = {
+            use openvmm_defs::config::GicMsiConfig;
+            use vm_topology::processor::aarch64::GicItsInfo;
+            use vm_topology::processor::aarch64::GicMsiController;
+            use vm_topology::processor::aarch64::GicV2mInfo;
+
+            // Resolve ITS vs v2m and determine v2m SPI count.
+            let arch_config = match &cfg.processor_topology.arch {
+                Some(ArchTopologyConfig::Aarch64(a)) => a,
+                _ => &Aarch64TopologyConfig::default(),
+            };
+            let is_gicv2 = match &arch_config.gic_config {
+                Some(GicConfig::V2(_)) => true,
+                _ => !platform_info.supports_gic_v3,
+            };
+            let v2m_spi_count = match &arch_config.gic_msi {
+                GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None,
+                GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT),
+                GicMsiConfig::Its => {
+                    if is_gicv2 {
+                        anyhow::bail!("ITS is incompatible with GICv2");
                     }
-                    GicMsiConfig::V2m { spi_count } => Some(*spi_count),
-                };
-
-                // Resolve SPI layout — all SPI allocations in one deterministic pass.
-                let spi_layout = super::spi_layout::resolve_spi_layout(
-                    &super::spi_layout::SpiLayoutInput {
-                        v2m_spi_count,
-                    },
-                )?;
+                    if !platform_info.supports_its {
+                        anyhow::bail!("ITS requested but the hypervisor does not support it");
+                    }
+                    None
+                }
+                GicMsiConfig::V2m { spi_count } => Some(*spi_count),
+            };
 
-                // Build the GIC MSI controller from resolved SPIs.
-                let gic_msi = if let Some(count) = v2m_spi_count {
-                    GicMsiController::V2m(GicV2mInfo {
-                        frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
-                        spi_base: spi_layout.v2m_spi_base.expect("v2m base must be allocated when v2m_spi_count is Some"),
-                        spi_count: count,
-                    })
-                } else {
-                    GicMsiController::Its(GicItsInfo {
-                        its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE,
-                    })
-                };
+            // Resolve SPI layout — all SPI allocations in one deterministic pass.
+            let spi_layout =
+                super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput {
+                    v2m_spi_count,
+                })?;
 
-                let processor_topology =
-                    build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)?;
+            // Build the GIC MSI controller from resolved SPIs.
+            let gic_msi = if let Some(count) = v2m_spi_count {
+                GicMsiController::V2m(GicV2mInfo {
+                    frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
+                    spi_base: spi_layout
+                        .v2m_spi_base
+                        .expect("v2m base must be allocated when v2m_spi_count is Some"),
+                    spi_count: count,
+                })
             } else {
-                let processor_topology =
-                    build_x86_topology(&cfg.processor_topology)?;
-            }
-        }
+                GicMsiController::Its(GicItsInfo {
+                    its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE,
+                })
+            };
+
+            build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)?
+        };
+        #[cfg(not(guest_arch = "aarch64"))]
+        let processor_topology = build_x86_topology(&cfg.processor_topology)?;
 
         let proto = hypervisor
             .new_partition(virt::ProtoPartitionConfig {

From 358071a96d3e3c9ae1469b4108f6beacaed401e5 Mon Sep 17 00:00:00 2001
From: John Starks <jostarks@microsoft.com>
Date: Tue, 19 May 2026 14:51:55 -0700
Subject: [PATCH 3/7] feedback: resolve GIC MSI and SPI layout in build_aarch64_topology

---
 openvmm/openvmm_core/src/worker/dispatch.rs   | 111 +++++++++---------
 openvmm/openvmm_core/src/worker/spi_layout.rs |   1 +
 openvmm/openvmm_defs/src/config.rs            |   8 +-
 openvmm/openvmm_entry/src/lib.rs              |   6 +-
 4 files changed, 64 insertions(+), 62 deletions(-)

diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs
index 7601fe164f..1e9981e57c 100644
--- a/openvmm/openvmm_core/src/worker/dispatch.rs
+++ b/openvmm/openvmm_core/src/worker/dispatch.rs
@@ -499,13 +499,25 @@ impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
     }
 }
 
+#[cfg(guest_arch = "aarch64")]
+struct Aarch64TopologyResult {
+    processor_topology: ProcessorTopology<Aarch64Topology>,
+    #[expect(dead_code)] // consumed by SMMU device wiring
+    spi_layout: super::spi_layout::ResolvedSpiLayout,
+}
+
 #[cfg(guest_arch = "aarch64")]
 fn build_aarch64_topology(
     config: &ProcessorTopologyConfig,
     platform_info: &virt::PlatformInfo,
-    gic_msi: vm_topology::processor::aarch64::GicMsiController,
-) -> anyhow::Result<ProcessorTopology<Aarch64Topology>> {
+) -> anyhow::Result<Aarch64TopologyResult> {
+    use openvmm_defs::config::GicMsiConfig;
     use vm_topology::processor::aarch64::Aarch64PlatformConfig;
+    use vm_topology::processor::aarch64::GicItsInfo;
+    use vm_topology::processor::aarch64::GicMsiController;
+    use vm_topology::processor::aarch64::GicV2mInfo;
+
+    const DEFAULT_GIC_V2M_SPI_COUNT: u32 = 64;
 
     let arch = match &config.arch {
         None => Default::default(),
@@ -582,6 +594,43 @@ fn build_aarch64_topology(
         }
     };
 
+    // Resolve ITS vs v2m and determine v2m SPI count.
+    let is_gicv2 = matches!(gic_version, GicVersion::V2 { .. });
+    let v2m_spi_count = match &arch.gic_msi {
+        GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None,
+        GicMsiConfig::Auto => Some(DEFAULT_GIC_V2M_SPI_COUNT),
+        GicMsiConfig::Its => {
+            if is_gicv2 {
+                anyhow::bail!("ITS is incompatible with GICv2");
+            }
+            if !platform_info.supports_its {
+                anyhow::bail!("ITS requested but the hypervisor does not support it");
+            }
+            None
+        }
+        GicMsiConfig::V2m { spi_count } => Some(spi_count.unwrap_or(DEFAULT_GIC_V2M_SPI_COUNT)),
+    };
+
+    // Resolve SPI layout — all SPI allocations in one deterministic pass.
+    let spi_layout = super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput {
+        v2m_spi_count,
+    })?;
+
+    // Build the GIC MSI controller from resolved SPIs.
+    let gic_msi = if let Some(count) = v2m_spi_count {
+        GicMsiController::V2m(GicV2mInfo {
+            frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
+            spi_base: spi_layout
+                .v2m_spi_base
+                .expect("v2m base must be allocated when v2m_spi_count is Some"),
+            spi_count: count,
+        })
+    } else {
+        GicMsiController::Its(GicItsInfo {
+            its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE,
+        })
+    };
+
     let platform = Aarch64PlatformConfig {
         gic_distributor_base,
         gic_version,
@@ -600,7 +649,10 @@ fn build_aarch64_topology(
     } else {
         builder.vps_per_socket(config.proc_count);
     }
-    Ok(builder.build(config.proc_count)?)
+    Ok(Aarch64TopologyResult {
+        processor_topology: builder.build(config.proc_count)?,
+        spi_layout,
+    })
 }
 
 /// A VM that has been loaded and can be run.
@@ -823,57 +875,8 @@ impl InitializedVm {
 
         #[cfg(guest_arch = "aarch64")]
         let processor_topology = {
-            use openvmm_defs::config::GicMsiConfig;
-            use vm_topology::processor::aarch64::GicItsInfo;
-            use vm_topology::processor::aarch64::GicMsiController;
-            use vm_topology::processor::aarch64::GicV2mInfo;
-
-            // Resolve ITS vs v2m and determine v2m SPI count.
-            let arch_config = match &cfg.processor_topology.arch {
-                Some(ArchTopologyConfig::Aarch64(a)) => a,
-                _ => &Aarch64TopologyConfig::default(),
-            };
-            let is_gicv2 = match &arch_config.gic_config {
-                Some(GicConfig::V2(_)) => true,
-                _ => !platform_info.supports_gic_v3,
-            };
-            let v2m_spi_count = match &arch_config.gic_msi {
-                GicMsiConfig::Auto if platform_info.supports_its && !is_gicv2 => None,
-                GicMsiConfig::Auto => Some(openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT),
-                GicMsiConfig::Its => {
-                    if is_gicv2 {
-                        anyhow::bail!("ITS is incompatible with GICv2");
-                    }
-                    if !platform_info.supports_its {
-                        anyhow::bail!("ITS requested but the hypervisor does not support it");
-                    }
-                    None
-                }
-                GicMsiConfig::V2m { spi_count } => Some(*spi_count),
-            };
-
-            // Resolve SPI layout — all SPI allocations in one deterministic pass.
-            let spi_layout =
-                super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput {
-                    v2m_spi_count,
-                })?;
-
-            // Build the GIC MSI controller from resolved SPIs.
-            let gic_msi = if let Some(count) = v2m_spi_count {
-                GicMsiController::V2m(GicV2mInfo {
-                    frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
-                    spi_base: spi_layout
-                        .v2m_spi_base
-                        .expect("v2m base must be allocated when v2m_spi_count is Some"),
-                    spi_count: count,
-                })
-            } else {
-                GicMsiController::Its(GicItsInfo {
-                    its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE,
-                })
-            };
-
-            build_aarch64_topology(&cfg.processor_topology, &platform_info, gic_msi)?
+            let result = build_aarch64_topology(&cfg.processor_topology, &platform_info)?;
+            result.processor_topology
         };
         #[cfg(not(guest_arch = "aarch64"))]
         let processor_topology = build_x86_topology(&cfg.processor_topology)?;
diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs
index 97065590e8..7b468741e8 100644
--- a/openvmm/openvmm_core/src/worker/spi_layout.rs
+++ b/openvmm/openvmm_core/src/worker/spi_layout.rs
@@ -31,6 +31,7 @@ impl SpiAllocator {
     }
 
     /// Allocates a single SPI, returning its GIC INTID.
+    #[expect(dead_code)] // used when SMMU instances are configured
     fn alloc(&mut self, tag: &str) -> anyhow::Result<u32> {
         if self.cursor < self.range_start {
             anyhow::bail!("SPI exhausted allocating {tag}");
diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs
index c2144a8216..1ccf871307 100644
--- a/openvmm/openvmm_defs/src/config.rs
+++ b/openvmm/openvmm_defs/src/config.rs
@@ -80,9 +80,6 @@ pub const DEFAULT_GIC_V2M_MSI_FRAME_BASE: u64 = 0xEFFE_8000;
 /// Size of the v2m MSI frame (one 4KB page is the architectural minimum).
 pub const GIC_V2M_MSI_FRAME_SIZE: u64 = 0x1000;
 
-/// Default number of SPIs reserved for PCIe MSIs when using GICv2m.
-pub const DEFAULT_GIC_V2M_SPI_COUNT: u32 = 64;
-
 /// Base address of the GICv3 ITS MMIO region. Must be 64 KiB aligned,
 /// below the v2m frame address, and not overlap other devices.
 /// The region extends from this base to base + GIC_ITS_SIZE (128 KiB).
@@ -294,8 +291,9 @@ pub enum GicMsiConfig {
     Its,
     /// Force GICv2m for MSI delivery via SPIs.
     V2m {
-        /// Number of SPIs to reserve for PCIe MSIs.
-        spi_count: u32,
+        /// Number of SPIs to reserve for PCIe MSIs. Defaults to a
+        /// platform-specific value when `None`.
+        spi_count: Option<u32>,
     },
 }
 
diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs
index 542253cb3f..fbd1c888de 100644
--- a/openvmm/openvmm_entry/src/lib.rs
+++ b/openvmm/openvmm_entry/src/lib.rs
@@ -1285,9 +1285,9 @@ async fn vm_config_from_command_line(
             gic_msi: match opt.gic_msi {
                 cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto,
                 cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its,
-                cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m {
-                    spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT,
-                },
+                cli_args::GicMsiCli::V2m => {
+                    openvmm_defs::config::GicMsiConfig::V2m { spi_count: None }
+                }
             },
         },
     );

From 082f0cb8f51f034b405f8e6d888a63f2b4a14aef Mon Sep 17 00:00:00 2001
From: John Starks <jostarks@microsoft.com>
Date: Tue, 19 May 2026 20:52:45 -0700
Subject: [PATCH 4/7] fix: bound the SPI allocation range by gic_nr_irqs

---
 openvmm/openvmm_core/src/worker/dispatch.rs   |  4 +++-
 openvmm/openvmm_core/src/worker/spi_layout.rs | 23 ++++++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs
index 1e9981e57c..e82da2ae46 100644
--- a/openvmm/openvmm_core/src/worker/dispatch.rs
+++ b/openvmm/openvmm_core/src/worker/dispatch.rs
@@ -612,7 +612,9 @@ fn build_aarch64_topology(
     };
 
     // Resolve SPI layout — all SPI allocations in one deterministic pass.
+    let gic_nr_irqs = openvmm_defs::config::DEFAULT_GIC_NR_IRQS;
     let spi_layout = super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput {
+        gic_nr_irqs,
         v2m_spi_count,
     })?;
 
@@ -637,7 +639,7 @@ fn build_aarch64_topology(
         gic_msi,
         pmu_gsiv,
         virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI,
-        gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS,
+        gic_nr_irqs,
     };
 
     let mut builder = TopologyBuilder::new_aarch64(platform);
diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs
index 7b468741e8..cf93b62a0d 100644
--- a/openvmm/openvmm_core/src/worker/spi_layout.rs
+++ b/openvmm/openvmm_core/src/worker/spi_layout.rs
@@ -12,9 +12,10 @@
 //! This is critical for hibernation — a resumed VM must get the same SPI
 //! layout as the original.
 //!
-//! SPIs are allocated top-down from INTID 1019. This maximizes distance from
-//! the guest-side vPCI MSI allocator (Hyper-V PCI driver in Linux), which
-//! allocates bottom-up starting at INTID 64.
+//! SPIs are allocated top-down from the highest SPI supported by the GIC
+//! (determined by `gic_nr_irqs`). This maximizes distance from the guest-side
+//! vPCI MSI allocator (Hyper-V PCI driver in Linux), which allocates bottom-up
+//! starting at INTID 64.
 
 /// Top-down GIC SPI allocator.
 struct SpiAllocator {
@@ -58,6 +59,9 @@ impl SpiAllocator {
 
 /// Inputs to the SPI layout resolver.
 pub(super) struct SpiLayoutInput {
+    /// Total number of GIC interrupt lines (INTIDs 0..gic_nr_irqs-1).
+    /// Determines the highest usable SPI.
+    pub gic_nr_irqs: u32,
     /// Number of SPIs to reserve for GICv2m MSI delivery. `None` when using
     /// ITS (no v2m block needed).
     pub v2m_spi_count: Option<u32>,
@@ -72,11 +76,12 @@ pub(super) struct ResolvedSpiLayout {
 /// Resolves SPI assignments for all platform devices.
 ///
 /// All allocations happen here in a single top-down pass over the SPI range
-/// `[64, 1019]`. The order of allocations determines the layout and must not
-/// change across OpenVMM versions for a given config, or hibernation will
-/// break.
+/// `[64, gic_nr_irqs-1]`. The order of allocations determines the layout and
+/// must not change across OpenVMM versions for a given config, or hibernation
+/// will break.
 pub(super) fn resolve_spi_layout(input: &SpiLayoutInput) -> anyhow::Result<ResolvedSpiLayout> {
-    let mut spi = SpiAllocator::new(64..=1019);
+    let max_intid = input.gic_nr_irqs.saturating_sub(1).min(1019);
+    let mut spi = SpiAllocator::new(64..=max_intid);
 
     // --- Allocation order (do not reorder!) ---
 
@@ -96,16 +101,18 @@ mod tests {
     #[test]
     fn v2m_allocation() {
         let result = resolve_spi_layout(&SpiLayoutInput {
+            gic_nr_irqs: 992,
             v2m_spi_count: Some(64),
         })
         .unwrap();
 
-        assert_eq!(result.v2m_spi_base, Some(956));
+        assert_eq!(result.v2m_spi_base, Some(928));
     }
 
     #[test]
     fn its_skips_v2m() {
         let result = resolve_spi_layout(&SpiLayoutInput {
+            gic_nr_irqs: 992,
             v2m_spi_count: None,
         })
         .unwrap();

From 03c51c118d1a4f042be9308a42e4e5711e83fef2 Mon Sep 17 00:00:00 2001
From: John Starks <jostarks@microsoft.com>
Date: Tue, 19 May 2026 22:24:55 -0700
Subject: [PATCH 5/7] fix: make the SpiAllocator cursor exclusive

---
 openvmm/openvmm_core/src/worker/spi_layout.rs | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs
index cf93b62a0d..87ed7d1031 100644
--- a/openvmm/openvmm_core/src/worker/spi_layout.rs
+++ b/openvmm/openvmm_core/src/worker/spi_layout.rs
@@ -20,6 +20,8 @@
 /// Top-down GIC SPI allocator.
 struct SpiAllocator {
     range_start: u32,
+    /// One past the last allocated INTID, or `range_end + 1` when nothing
+    /// has been allocated yet.
     cursor: u32,
 }
 
@@ -27,33 +29,31 @@ impl SpiAllocator {
     fn new(range: std::ops::RangeInclusive<u32>) -> Self {
         Self {
             range_start: *range.start(),
-            cursor: *range.end(),
+            cursor: *range.end() + 1,
         }
     }
 
     /// Allocates a single SPI, returning its GIC INTID.
     #[expect(dead_code)] // used when SMMU instances are configured
     fn alloc(&mut self, tag: &str) -> anyhow::Result<u32> {
-        if self.cursor < self.range_start {
+        if self.cursor <= self.range_start {
             anyhow::bail!("SPI exhausted allocating {tag}");
         }
-        let intid = self.cursor;
         self.cursor -= 1;
-        Ok(intid)
+        Ok(self.cursor)
     }
 
     /// Allocates a contiguous block of `count` SPIs, returning the lowest
     /// GIC INTID in the block.
     fn alloc_block(&mut self, tag: &str, count: u32) -> anyhow::Result<u32> {
-        let available = self.cursor.saturating_sub(self.range_start) + 1;
-        if count == 0 || count > available {
+        let available = self.cursor - self.range_start;
+        if count > available {
             anyhow::bail!(
                 "SPI exhausted allocating {tag}: need {count}, only {available} remaining"
             );
         }
-        let base = self.cursor - count + 1;
-        self.cursor = base - 1;
-        Ok(base)
+        self.cursor -= count;
+        Ok(self.cursor)
     }
 }
 

From 3ab0cca3516d8b1ee2cd5d796639f5109786a41f Mon Sep 17 00:00:00 2001
From: John Starks <jostarks@microsoft.com>
Date: Tue, 19 May 2026 14:27:57 -0700
Subject: [PATCH 6/7] smmu: add emulated SMMUv3 and wire it to PCIe root complexes

---
 Cargo.lock                                    |   19 +
 Cargo.toml                                    |    1 +
 openhcl/underhill_core/src/loader/mod.rs      |    1 +
 openvmm/openvmm_core/Cargo.toml               |    1 +
 openvmm/openvmm_core/src/worker/dispatch.rs   |  365 ++-
 .../openvmm_core/src/worker/memory_layout.rs  |   25 +
 openvmm/openvmm_core/src/worker/spi_layout.rs |   47 +-
 .../src/worker/vm_loaders/linux.rs            |   49 +
 openvmm/openvmm_defs/src/config.rs            |   14 +
 openvmm/openvmm_entry/src/cli_args.rs         |    5 +
 openvmm/openvmm_entry/src/lib.rs              |    8 +
 petri/src/vm/openvmm/modify.rs                |   29 +
 vm/acpi_spec/src/iort.rs                      |   98 +
 vm/devices/iommu/smmu/Cargo.toml              |   24 +
 vm/devices/iommu/smmu/src/emulator.rs         | 2139 +++++++++++++++++
 vm/devices/iommu/smmu/src/lib.rs              |   19 +
 vm/devices/iommu/smmu/src/shared.rs           | 1371 +++++++++++
 vm/devices/iommu/smmu/src/spec/cd.rs          |  445 ++++
 vm/devices/iommu/smmu/src/spec/commands.rs    |  299 +++
 vm/devices/iommu/smmu/src/spec/events.rs      |  265 ++
 vm/devices/iommu/smmu/src/spec/mod.rs         |   17 +
 vm/devices/iommu/smmu/src/spec/pt.rs          |  396 +++
 vm/devices/iommu/smmu/src/spec/registers.rs   |  708 ++++++
 vm/devices/iommu/smmu/src/spec/ste.rs         |  309 +++
 vm/devices/iommu/smmu/src/translate.rs        | 1046 ++++++++
 vm/devices/pci/pci_core/src/bus_range.rs      |    6 +-
 vmm_core/src/acpi_builder.rs                  |  384 ++-
 .../vmm_tests/tests/tests/multiarch/pcie.rs   |  108 +
 28 files changed, 8100 insertions(+), 98 deletions(-)
 create mode 100644 vm/devices/iommu/smmu/Cargo.toml
 create mode 100644 vm/devices/iommu/smmu/src/emulator.rs
 create mode 100644 vm/devices/iommu/smmu/src/lib.rs
 create mode 100644 vm/devices/iommu/smmu/src/shared.rs
 create mode 100644 vm/devices/iommu/smmu/src/spec/cd.rs
 create mode 100644 vm/devices/iommu/smmu/src/spec/commands.rs
 create mode 100644 vm/devices/iommu/smmu/src/spec/events.rs
 create mode 100644 vm/devices/iommu/smmu/src/spec/mod.rs
 create mode 100644 vm/devices/iommu/smmu/src/spec/pt.rs
 create mode 100644 vm/devices/iommu/smmu/src/spec/registers.rs
 create mode 100644 vm/devices/iommu/smmu/src/spec/ste.rs
 create mode 100644 vm/devices/iommu/smmu/src/translate.rs

diff --git a/Cargo.lock b/Cargo.lock
index 43ff006ebf..0fec691590 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5334,6 +5334,7 @@ dependencies = [
  "scsi_core",
  "scsidisk",
  "serial_16550_resources",
+ "smmu",
  "sparse_mmap",
  "state_unit",
  "storvsp",
@@ -7293,6 +7294,24 @@ version = "1.15.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
 
+[[package]]
+name = "smmu"
+version = "0.0.0"
+dependencies = [
+ "anyhow",
+ "bitfield-struct 0.11.0",
+ "chipset_device",
+ "guestmem",
+ "inspect",
+ "open_enum",
+ "pal_event",
+ "parking_lot",
+ "pci_core",
+ "tracelimit",
+ "vmcore",
+ "zerocopy",
+]
+
 [[package]]
 name = "smoltcp"
 version = "0.12.0"
diff --git a/Cargo.toml b/Cargo.toml
index 98dfb3c210..22bf7a59d4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -290,6 +290,7 @@ pci_bus = { path = "vm/devices/pci/pci_bus" }
 pci_core = { path = "vm/devices/pci/pci_core" }
 pci_resources = { path = "vm/devices/pci/pci_resources" }
 pcie = { path = "vm/devices/pci/pcie" }
+smmu = { path = "vm/devices/iommu/smmu" }
 vpci = { path = "vm/devices/pci/vpci" }
 vpci_client = { path = "vm/devices/pci/vpci_client" }
 vpci_protocol = { path = "vm/devices/pci/vpci_protocol" }
diff --git a/openhcl/underhill_core/src/loader/mod.rs b/openhcl/underhill_core/src/loader/mod.rs
index 9182fe02d9..f737834ecc 100644
--- a/openhcl/underhill_core/src/loader/mod.rs
+++ b/openhcl/underhill_core/src/loader/mod.rs
@@ -492,6 +492,7 @@ pub fn write_uefi_config(
                 // Not used for MADT/SRAT generation; only matters for FADT.
                 hypervisor_vendor_identity: 0,
                 virt_timer_ppi: processor_topology.virt_timer_ppi(),
+                smmu: Vec::new(),
             },
         };
 
diff --git a/openvmm/openvmm_core/Cargo.toml b/openvmm/openvmm_core/Cargo.toml
index 2adf4460a3..2afde03497 100644
--- a/openvmm/openvmm_core/Cargo.toml
+++ b/openvmm/openvmm_core/Cargo.toml
@@ -70,6 +70,7 @@ pci_bus.workspace = true
 pci_core.workspace = true
 pci_resources.workspace = true
 pcie.workspace = true
+smmu.workspace = true
 scsi_core.workspace = true
 scsidisk.workspace = true
 serial_16550_resources.workspace = true
diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs
index e82da2ae46..a1a9044f16 100644
--- a/openvmm/openvmm_core/src/worker/dispatch.rs
+++ b/openvmm/openvmm_core/src/worker/dispatch.rs
@@ -407,11 +407,22 @@ pub(crate) struct InitializedVm {
     chipset_low_mmio: MemoryRange,
     chipset_high_mmio: MemoryRange,
     vtl2_chipset_mmio: MemoryRange,
+    resolved_smmu_resources: Vec<ResolvedSmmuResources>,
     processor_topology: ProcessorTopology,
     igvm_file: Option<IgvmFile>,
     driver_source: VmTaskDriverSource,
 }
 
+/// Resolved resources for a single SMMUv3 instance.
+struct ResolvedSmmuResources {
+    /// MMIO base address (from the memory layout allocator).
+    base: u64,
+    /// GIC INTID for the event queue interrupt (from the SPI allocator).
+    evtq_gsiv: u32,
+    /// GIC INTID for the global error interrupt (from the SPI allocator).
+    gerr_gsiv: u32,
+}
+
 trait ExtractTopologyConfig {
     fn to_config(&self) -> ProcessorTopologyConfig;
 }
@@ -494,6 +505,7 @@ impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
                     None => PmuGsivConfig::Disabled,
                 },
                 gic_msi: Default::default(),
+                smmu: Vec::new(),
             })),
         }
     }
@@ -502,7 +514,6 @@ impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
 #[cfg(guest_arch = "aarch64")]
 struct Aarch64TopologyResult {
     processor_topology: ProcessorTopology<Aarch64Topology>,
-    #[expect(dead_code)] // consumed by SMMU device wiring
     spi_layout: super::spi_layout::ResolvedSpiLayout,
 }
 
@@ -616,6 +627,7 @@ fn build_aarch64_topology(
     let spi_layout = super::spi_layout::resolve_spi_layout(&super::spi_layout::SpiLayoutInput {
         gic_nr_irqs,
         v2m_spi_count,
+        smmu_count: arch.smmu.len(),
     })?;
 
     // Build the GIC MSI controller from resolved SPIs.
@@ -730,6 +742,12 @@ struct LoadedVmInner {
     automatic_guest_reset: bool,
     pcie_host_bridges: Vec<PcieHostBridge>,
     pcie_root_complexes: Vec<Arc<closeable_mutex::CloseableMutex<GenericPcieRootComplex>>>,
+    /// SMMU configurations, one per instance.
+    #[cfg_attr(not(guest_arch = "aarch64"), expect(dead_code))]
+    smmu_configs: Vec<vmm_core::acpi_builder::AcpiSmmuConfig>,
+    /// Per-RC SMMU shared state, indexed parallel to `pcie_host_bridges`.
+    /// `None` for root complexes without an SMMU.
+    smmu_shared_states: Vec<Option<Arc<smmu::SmmuSharedState>>>,
     pcie_hotplug_devices: Vec<(
         String,
         vmotherboard::DynamicDeviceUnit,
@@ -876,9 +894,9 @@ impl InitializedVm {
         };
 
         #[cfg(guest_arch = "aarch64")]
-        let processor_topology = {
+        let (processor_topology, spi_layout) = {
             let result = build_aarch64_topology(&cfg.processor_topology, &platform_info)?;
-            result.processor_topology
+            (result.processor_topology, result.spi_layout)
         };
         #[cfg(not(guest_arch = "aarch64"))]
         let processor_topology = build_x86_topology(&cfg.processor_topology)?;
@@ -938,12 +956,30 @@ impl InitializedVm {
             .filter(|(bus, _)| matches!(bus, VirtioBus::Mmio))
             .count();
 
+        // Count SMMU instances so the layout engine can allocate their MMIO.
+        let smmu_count = {
+            #[cfg(guest_arch = "aarch64")]
+            {
+                match &cfg.processor_topology.arch {
+                    Some(ArchTopologyConfig::Aarch64(Aarch64TopologyConfig { smmu, .. })) => {
+                        smmu.len()
+                    }
+                    _ => 0,
+                }
+            }
+            #[cfg(not(guest_arch = "aarch64"))]
+            {
+                0
+            }
+        };
+
         let resolved_layout = resolve_memory_layout(MemoryLayoutInput {
             mem_size: cfg.memory.mem_size,
             numa_mem_sizes: cfg.memory.numa_mem_sizes.as_deref(),
             layout: cfg.layout.clone(),
             pcie_root_complexes: &cfg.pcie_root_complexes,
             virtio_mmio_count,
+            smmu_count,
             vtl2_layout,
             physical_address_size,
         })
@@ -955,6 +991,23 @@ impl InitializedVm {
         let chipset_high_mmio = resolved_layout.chipset_high_mmio;
         let vtl2_chipset_mmio = resolved_layout.vtl2_chipset_mmio;
 
+        // Combine SMMU MMIO ranges with SPI layout.
+        cfg_if! {
+            if #[cfg(guest_arch = "aarch64")] {
+                let resolved_smmu_resources: Vec<ResolvedSmmuResources> = resolved_layout.smmu_ranges
+                    .iter()
+                    .zip(&spi_layout.smmu)
+                    .map(|(range, spis)| ResolvedSmmuResources {
+                        base: range.start(),
+                        evtq_gsiv: spis.evtq_gsiv,
+                        gerr_gsiv: spis.gerr_gsiv,
+                    })
+                    .collect();
+            } else {
+                let resolved_smmu_resources: Vec<ResolvedSmmuResources> = Vec::new();
+            }
+        }
+
         // Place the alias map at the end of the address space. Newer versions
         // of OpenHCL support receiving this offset via devicetree (especially
         // important on ARM64 where the physical address width used here is not
@@ -1070,6 +1123,7 @@ impl InitializedVm {
             chipset_low_mmio,
             chipset_high_mmio,
             vtl2_chipset_mmio,
+            resolved_smmu_resources,
             processor_topology,
             igvm_file,
             driver_source,
@@ -1101,6 +1155,7 @@ impl InitializedVm {
             chipset_low_mmio,
             chipset_high_mmio,
             vtl2_chipset_mmio,
+            resolved_smmu_resources,
             processor_topology,
             igvm_file,
             driver_source,
@@ -1789,8 +1844,10 @@ impl InitializedVm {
         #[cfg(not(guest_arch = "aarch64"))]
         let use_its = false;
 
-        let (pcie_host_bridges, pcie_root_complexes) = {
+        let (pcie_host_bridges, pcie_rc_name_to_idx, pcie_root_complexes) = {
             let mut pcie_host_bridges = Vec::new();
+            let mut pcie_rc_name_to_idx: std::collections::HashMap<String, usize> =
+                std::collections::HashMap::new();
             let mut pcie_root_complexes = Vec::new();
 
             for (rc, ranges) in cfg
@@ -1859,13 +1916,14 @@ impl InitializedVm {
                     high_mmio: ranges.high_mmio,
                 });
 
+                pcie_rc_name_to_idx.insert(rc.name.clone(), pcie_host_bridges.len() - 1);
                 pcie_root_complexes.push(root_complex.clone());
 
                 let bus_id = vmotherboard::BusId::new(&rc.name);
                 chipset_builder.register_weak_mutex_pcie_enumerator(bus_id, Box::new(root_complex));
             }
 
-            (pcie_host_bridges, pcie_root_complexes)
+            (pcie_host_bridges, pcie_rc_name_to_idx, pcie_root_complexes)
         };
 
         // Build a port-name→(segment, bus_range) map covering all ports in
@@ -1984,44 +2042,118 @@ impl InitializedVm {
             Some(handle)
         };
 
+        // Determine which SMMU instances to create. When active, PCIe devices
+        // on the covered root complexes get translating GuestMemory and
+        // SignalMsi wrappers that route DMA and MSI writes through the
+        // emulated SMMUv3. Each SMMU instance covers one root complex.
+        #[cfg(guest_arch = "aarch64")]
+        let smmu_instances: Vec<openvmm_defs::config::SmmuInstanceConfig> = match &cfg
+            .processor_topology
+            .arch
+        {
+            Some(ArchTopologyConfig::Aarch64(Aarch64TopologyConfig { smmu, .. })) => smmu.clone(),
+            _ => Vec::new(),
+        };
+        #[cfg(not(guest_arch = "aarch64"))]
+        let smmu_instances: Vec<openvmm_defs::config::SmmuInstanceConfig> = Vec::new();
+
+        // When SMMU instances are configured, instantiate a device for each
+        // and build lookup maps:
+        // - RC index → shared_state (for per-device wiring and hotplug)
+        // - port_name → shared_state (for the per-device loop below)
+        //
+        // SPI assignments come from the SPI allocator via resolved_smmu_resources.
+        // IRQ_LINE_SET vectors are GSIV - SPI_RANGE.start() (the GIC target offset).
+        let mut smmu_shared_states: Vec<Option<Arc<smmu::SmmuSharedState>>> =
+            vec![None; pcie_host_bridges.len()];
+        let mut smmu_configs = Vec::new();
+        for (idx, inst) in smmu_instances.iter().enumerate() {
+            // Look up the RC by name to get its index.
+            let rc_pos = match pcie_rc_name_to_idx.get(&inst.rc_name) {
+                Some(&i) => i,
+                None => {
+                    anyhow::bail!(
+                        "SMMU instance references unknown root complex {:?}",
+                        inst.rc_name
+                    );
+                }
+            };
+            let smmu = &resolved_smmu_resources[idx];
+            let evtq_irq_vector = smmu.evtq_gsiv - *vmm_core::emuplat::gic::SPI_RANGE.start();
+            let gerror_irq_vector = smmu.gerr_gsiv - *vmm_core::emuplat::gic::SPI_RANGE.start();
+            let device_name = format!("smmu:{}", inst.rc_name);
+            let smmu_config = smmu::SmmuConfig {
+                sid_bits: 16,
+                oas: 0b010,
+            };
+            let smmu_device =
+                chipset_builder
+                    .arc_mutex_device(device_name.as_str())
+                    .add(|services| {
+                        let evtq_irq = services.new_line(IRQ_LINE_SET, "evtq", evtq_irq_vector);
+                        let gerror_irq =
+                            services.new_line(IRQ_LINE_SET, "gerror", gerror_irq_vector);
+                        smmu::SmmuDevice::new(
+                            smmu.base,
+                            gm.clone(),
+                            &smmu_config,
+                            Some(evtq_irq),
+                            Some(gerror_irq),
+                        )
+                    })?;
+            smmu_shared_states[rc_pos] = Some(smmu_device.lock().shared_state().clone());
+            smmu_configs.push(vmm_core::acpi_builder::AcpiSmmuConfig {
+                rc_index: pcie_host_bridges[rc_pos].index,
+                segment: pcie_host_bridges[rc_pos].segment,
+                base: smmu.base,
+                event_gsiv: smmu.evtq_gsiv,
+                gerr_gsiv: smmu.gerr_gsiv,
+            });
+        }
+
+        // Build a port-name → SMMU shared state map. Each downstream port of
+        // an SMMU-covered root complex inherits that SMMU.
+        let smmu_port_map: std::collections::HashMap<Arc<str>, Arc<smmu::SmmuSharedState>> =
+            smmu_shared_states
+                .iter()
+                .zip(pcie_root_complexes.iter())
+                .flat_map(|(shared, rc)| {
+                    let shared = shared.clone();
+                    rc.lock()
+                        .downstream_ports()
+                        .into_iter()
+                        .filter_map(move |dpi| shared.as_ref().map(|s| (dpi.name, s.clone())))
+                })
+                .collect();
+
+        // Track which RCs have SMMUs (for VFIO blocking).
+        let mut smmu_per_rc = vec![false; pcie_host_bridges.len()];
+        for inst in &smmu_instances {
+            if let Some(&i) = pcie_rc_name_to_idx.get(&inst.rc_name) {
+                smmu_per_rc[i] = true;
+            }
+        }
+
+        // Build port-name set for ports behind SMMUs.
+        let smmu_s1_ports: std::collections::HashSet<Arc<str>> = smmu_per_rc
+            .iter()
+            .zip(pcie_root_complexes.iter())
+            .flat_map(|(&has_smmu, rc)| {
+                rc.lock()
+                    .downstream_ports()
+                    .into_iter()
+                    .filter_map(move |dpi| if has_smmu { Some(dpi.name) } else { None })
+            })
+            .collect();
+
         // Resolve PCIe devices concurrently.
         //
         // When ITS is active, the root complex's ITS-wrapped SignalMsi
         // and IrqFd are shared across all devices on that complex. Each
         // device's MsiConnection carries a default BDF derived from the
         // port's AssignedBusRange, which the MsiTarget resolves lazily
-        // at interrupt delivery time.
-
-        // Build per-segment ITS-wrapped signal_msi and irqfd. Each root
-        // complex connection already has ITS wrapping for port MSIs; we
-        // share the same wrapped instances for child devices.
-        let its_signal_msi: std::collections::HashMap<u16, Arc<dyn pci_core::msi::SignalMsi>> =
-            if use_its {
-                let mut map = std::collections::HashMap::new();
-                if let Some(s) = partition.as_signal_msi(Vtl::Vtl0) {
-                    for hb in &pcie_host_bridges {
-                        map.entry(hb.segment).or_insert_with(|| {
-                            Arc::new(pcie::its::ItsSignalMsi::new(s.clone(), hb.segment)) as _
-                        });
-                    }
-                }
-                map
-            } else {
-                std::collections::HashMap::new()
-            };
-        let its_irqfd: std::collections::HashMap<u16, Arc<dyn vmcore::irqfd::IrqFd>> = if use_its {
-            let mut map = std::collections::HashMap::new();
-            if let Some(fd) = partition.irqfd() {
-                for hb in &pcie_host_bridges {
-                    map.entry(hb.segment).or_insert_with(|| {
-                        Arc::new(pcie::its::ItsIrqFd::new(fd.clone(), hb.segment)) as _
-                    });
-                }
-            }
-            map
-        } else {
-            std::collections::HashMap::new()
-        };
+        // at interrupt delivery time. When SMMU is enabled, per-device
+        // wrappers translate IOVAs and MSI addresses through the emulated SMMU.
 
         try_join_all(cfg.pcie_devices.into_iter().map(|dev_cfg| {
             let chipset_builder = &chipset_builder;
@@ -2031,8 +2163,8 @@ impl InitializedVm {
             let partition = &partition;
             let mapper = &mapper;
             let port_info = &port_info;
-            let its_signal_msi = &its_signal_msi;
-            let its_irqfd = &its_irqfd;
+            let smmu_port_map = &smmu_port_map;
+            let smmu_s1_ports = &smmu_s1_ports;
             async move {
                 let port_name: Arc<str> = dev_cfg.port_name.into();
                 let pi = port_info.get(&port_name).ok_or_else(|| {
@@ -2042,14 +2174,39 @@ impl InitializedVm {
                     )
                 })?;
 
+                // Block VFIO devices behind S1-capable SMMUs. The
+                // emulated SMMU's S1 page tables are not programmed
+                // into the host IOMMU, so VFIO DMA would bypass S1
+                // translation. This will be lifted when iommufd
+                // nested translation support is available.
+                if dev_cfg.resource.id() == "vfio" && smmu_s1_ports.contains(&port_name) {
+                    anyhow::bail!(
+                        "VFIO device on port {:?} is behind an S1-capable SMMU, \
+                         but iommufd nested translation is not available. \
+                         Either place the device on a root complex without an SMMU, \
+                         configure the SMMU for S2-only mode (--smmu <rc>,s2-only), \
+                         or enable iommufd nested translation.",
+                        &*port_name,
+                    );
+                }
+
                 let msi_conn = pci_core::msi::MsiConnection::new(pi.bus_range.clone(), 0);
 
+                let (dev_gm, signal_msi, irqfd) = build_pcie_msi_context(
+                    partition.as_ref(),
+                    gm,
+                    &pi.bus_range,
+                    pi.segment,
+                    use_its,
+                    smmu_port_map.get(&port_name),
+                );
+
                 vmm_core::device_builder::build_pcie_device(
                     chipset_builder,
                     port_name.clone(),
                     driver_source,
                     resolver,
-                    gm,
+                    &dev_gm,
                     dev_cfg.resource,
                     partition.clone().into_doorbell_registration(Vtl::Vtl0),
                     Some(mapper),
@@ -2057,23 +2214,9 @@ impl InitializedVm {
                 )
                 .await?;
 
-                // When ITS is active, use the per-segment wrapped
-                // SignalMsi and IrqFd. The device's MsiConnection
-                // carries a default BDF from the port's bus range.
-                let signal_msi = if use_its {
-                    its_signal_msi.get(&pi.segment).cloned()
-                } else {
-                    partition.as_signal_msi(Vtl::Vtl0)
-                };
                 if let Some(target) = signal_msi {
                     msi_conn.connect(target);
                 }
-
-                let irqfd = if use_its {
-                    its_irqfd.get(&pi.segment).cloned()
-                } else {
-                    partition.irqfd()
-                };
                 if let Some(fd) = irqfd {
                     msi_conn.connect_irqfd(fd);
                 }
@@ -2567,6 +2710,8 @@ impl InitializedVm {
                 pcie_host_bridges,
                 pcie_root_complexes,
                 pcie_hotplug_devices: Vec::new(),
+                smmu_configs,
+                smmu_shared_states,
             },
         };
 
@@ -2614,6 +2759,7 @@ impl LoadedVmInner {
                     0
                 },
                 virt_timer_ppi: self.processor_topology.virt_timer_ppi(),
+                smmu: self.smmu_configs.clone(),
             },
         };
 
@@ -2709,6 +2855,7 @@ impl LoadedVmInner {
                     enable_serial,
                     &self.processor_topology,
                     &self.pcie_host_bridges,
+                    &self.smmu_configs,
                     build_acpi,
                 )?;
 
@@ -3098,7 +3245,25 @@ impl LoadedVm {
                                 .expect("port was just found above")
                                 .bus_range;
 
-                            let msi_conn = pci_core::msi::MsiConnection::new(bus_range, 0);
+                            let segment = self.inner.pcie_host_bridges[rc_idx].segment;
+                            let msi_conn = pci_core::msi::MsiConnection::new(bus_range.clone(), 0);
+
+                            #[cfg(guest_arch = "aarch64")]
+                            let use_its = matches!(
+                                self.inner.processor_topology.gic_msi(),
+                                vm_topology::processor::aarch64::GicMsiController::Its(_)
+                            );
+                            #[cfg(not(guest_arch = "aarch64"))]
+                            let use_its = false;
+
+                            let (dev_gm, signal_msi, irqfd) = build_pcie_msi_context(
+                                self.inner.partition.as_ref(),
+                                &self.inner.gm,
+                                &bus_range,
+                                segment,
+                                use_its,
+                                self.inner.smmu_shared_states[rc_idx].as_ref(),
+                            );
 
                             let (unit, device) = self.inner.chipset_devices.add_dyn_device(
                                 &self.inner.driver_source,
@@ -3112,7 +3277,7 @@ impl LoadedVm {
                                                 msi_target: msi_conn.target(),
                                                 register_mmio,
                                                 driver_source: &self.inner.driver_source,
-                                                guest_memory: &self.inner.gm,
+                                                guest_memory: &dev_gm,
                                                 doorbell_registration: self.inner.partition.clone().into_doorbell_registration(Vtl::Vtl0),
                                                 shared_mem_mapper: None,
                                             },
@@ -3123,29 +3288,13 @@ impl LoadedVm {
                                 },
                             ).await?;
 
-                            // Connect the MSI target and IrqFd, wrapping
-                            // with ITS segment translation when needed.
-                            #[cfg(guest_arch = "aarch64")]
-                            let use_its = matches!(
-                                self.inner.processor_topology.gic_msi(),
-                                vm_topology::processor::aarch64::GicMsiController::Its(_)
-                            );
-                            #[cfg(not(guest_arch = "aarch64"))]
-                            let use_its = false;
-                            let segment = self.inner.pcie_host_bridges[rc_idx].segment;
-                            if let Some(s) = self.inner.partition.as_signal_msi(Vtl::Vtl0) {
-                                if use_its {
-                                    msi_conn.connect(Arc::new(pcie::its::ItsSignalMsi::new(s, segment)));
-                                } else {
-                                    msi_conn.connect(s);
-                                }
+                            // Connect the signal_msi and irqfd (possibly
+                            // ITS-wrapped and/or SMMU-wrapped).
+                            if let Some(target) = signal_msi {
+                                msi_conn.connect(target);
                             }
-                            if let Some(fd) = self.inner.partition.irqfd() {
-                                if use_its {
-                                    msi_conn.connect_irqfd(Arc::new(pcie::its::ItsIrqFd::new(fd, segment)));
-                                } else {
-                                    msi_conn.connect_irqfd(fd);
-                                }
+                            if let Some(fd) = irqfd {
+                                msi_conn.connect_irqfd(fd);
                             }
 
                             // Wrap the device as a GenericPciBusDevice for the port.
@@ -3413,6 +3562,59 @@ impl LoadedVm {
     }
 }
 
+/// Build the `(GuestMemory, SignalMsi, IrqFd)` triple for a PCIe device.
+///
+/// When ITS is active, wraps SignalMsi and IrqFd with segment translation.
+/// When an SMMU covers the device, additionally wraps all three with SMMU
+/// translation; without a SignalMsi target, GuestMemory stays untranslated.
+fn build_pcie_msi_context(
+    partition: &dyn HvlitePartition,
+    gm: &GuestMemory,
+    bus_range: &pci_core::bus_range::AssignedBusRange,
+    segment: u16,
+    use_its: bool,
+    smmu_shared: Option<&Arc<smmu::SmmuSharedState>>,
+) -> (
+    GuestMemory,
+    Option<Arc<dyn pci_core::msi::SignalMsi>>,
+    Option<Arc<dyn vmcore::irqfd::IrqFd>>,
+) {
+    // Base signal_msi: ITS-wrapped or plain.
+    let base_signal_msi = if use_its {
+        partition.as_signal_msi(Vtl::Vtl0).map(|s| {
+            Arc::new(pcie::its::ItsSignalMsi::new(s, segment)) as Arc<dyn pci_core::msi::SignalMsi>
+        })
+    } else {
+        partition.as_signal_msi(Vtl::Vtl0)
+    };
+
+    // Base irqfd: ITS-wrapped or plain.
+    let base_irqfd = if use_its {
+        partition.irqfd().map(|fd| {
+            Arc::new(pcie::its::ItsIrqFd::new(fd, segment)) as Arc<dyn vmcore::irqfd::IrqFd>
+        })
+    } else {
+        partition.irqfd()
+    };
+
+    // When an SMMU covers this device, wrap GuestMemory and SignalMsi/IrqFd
+    // with SMMU translation. stream_id_base is 0 because each SMMU is 1:1
+    // with its root complex — stream IDs are plain BDFs.
+    if let Some(shared) = smmu_shared {
+        let (translating_gm, smmu_msi) = base_signal_msi
+            .map(|inner_msi| {
+                let (gm, msi) = shared.create_device_context(bus_range.clone(), 0, gm, inner_msi);
+                (gm, Some(msi as Arc<dyn pci_core::msi::SignalMsi>))
+            })
+            .unwrap_or_else(|| (gm.clone(), None));
+        let irqfd =
+            base_irqfd.map(|fd| shared.create_irqfd(0, fd) as Arc<dyn vmcore::irqfd::IrqFd>);
+        (translating_gm, smmu_msi, irqfd)
+    } else {
+        (gm.clone(), base_signal_msi, base_irqfd)
+    }
+}
+
 #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
 fn add_devices_to_dsdt_x64(
     mem_layout: &MemoryLayout,
@@ -3511,6 +3713,13 @@ fn add_devices_to_dsdt_arm64(
         // Always place under VMOD, not PCI0 — ARM64 doesn't use the x86
         // PCI0 DSDT node.
         dsdt.add_vmbus(false, Some(VMBUS_INTID));
+    } else if mem_layout.mmio().len() >= 2 {
+        // Even without HV enlightenments (e.g. KVM aarch64), the MMIO
+        // module is needed so the kernel knows the available MMIO address
+        // ranges for PCIe BAR allocation.
+        let low_mmio_gap = mem_layout.mmio()[0];
+        let high_mmio_gap: MemoryRange = mem_layout.mmio()[1];
+        dsdt.add_mmio_module(low_mmio_gap, high_mmio_gap);
     }
 
     if enable_serial {
diff --git a/openvmm/openvmm_core/src/worker/memory_layout.rs b/openvmm/openvmm_core/src/worker/memory_layout.rs
index 3d42ca88d2..85e87c30ff 100644
--- a/openvmm/openvmm_core/src/worker/memory_layout.rs
+++ b/openvmm/openvmm_core/src/worker/memory_layout.rs
@@ -32,6 +32,9 @@ const PAGE_SIZE: u64 = 4096;
 const TWO_MB: u64 = 2 * 1024 * 1024;
 const GB: u64 = 1024 * 1024 * 1024;
 
+/// SMMUv3 MMIO region size: two 64 KiB pages (page 0 + page 1).
+const SMMU_SIZE: u64 = 0x2_0000;
+
 /// PCIe ECAM: 32 devices * 8 functions * 4 KiB config space = 1 MB per bus.
 const PCIE_ECAM_BYTES_PER_BUS: u64 = 32 * 8 * 4096;
 
@@ -61,6 +64,10 @@ pub(super) struct ResolvedMemoryLayout {
     /// VTL2-private chipset MMIO range, reported to VTL2 VMBus via the device
     /// tree. `EMPTY` when VTL2 is not configured or has no chipset MMIO.
     pub vtl2_chipset_mmio: MemoryRange,
+    /// Resolved MMIO ranges for SMMUv3 instances, one per configured SMMU.
+    /// Each range is `SMMU_SIZE` bytes. Empty when no SMMUs are configured.
+    #[cfg_attr(not(guest_arch = "aarch64"), expect(dead_code))]
+    pub smmu_ranges: Vec<MemoryRange>,
 }
 
 #[derive(Debug)]
@@ -84,6 +91,9 @@ pub(super) struct MemoryLayoutInput<'a> {
     /// Number of virtio-mmio device slots to allocate in 32-bit MMIO space.
     /// A single contiguous region of `count * 4 KiB` is allocated.
     pub virtio_mmio_count: usize,
+    /// Number of SMMUv3 instances to allocate MMIO for. Each instance requires
+    /// `SMMU_SIZE` bytes (128 KiB), 128 KiB aligned, in 32-bit MMIO space.
+    pub smmu_count: usize,
     /// Optional IGVM VTL2 private-memory request. This is allocated after all
     /// VTL0-visible RAM and MMIO and is carried separately from ordinary RAM.
     pub vtl2_layout: Option<Vtl2MemoryLayoutRequest>,
@@ -231,6 +241,19 @@ pub(super) fn resolve_memory_layout(
         );
     }
 
+    // SMMUv3: allocate one 128 KiB region per instance. Placed below 4 GiB
+    // alongside other aarch64 system devices (GIC, ITS, PL011).
+    let mut smmu_ranges: Vec<MemoryRange> = vec![MemoryRange::EMPTY; input.smmu_count];
+    for (idx, range) in smmu_ranges.iter_mut().enumerate() {
+        builder.request(
+            format!("smmu-{idx}"),
+            range,
+            SMMU_SIZE,
+            SMMU_SIZE,
+            Placement::Mmio32,
+        );
+    }
+
     // RAM request order is part of the NUMA compatibility contract: the first
     // request maps to vnode 0, the second to vnode 1, and so on. For GB-sized
     // nodes, use GB alignment so holes do not create sub-GB RAM chunks. For
@@ -385,6 +408,7 @@ pub(super) fn resolve_memory_layout(
         chipset_low_mmio,
         chipset_high_mmio,
         vtl2_chipset_mmio,
+        smmu_ranges,
     })
 }
 
@@ -485,6 +509,7 @@ mod tests {
             layout: DEFAULT_LAYOUT,
             pcie_root_complexes: &[],
             virtio_mmio_count: 0,
+            smmu_count: 0,
             vtl2_layout,
             physical_address_size: 46,
         }
diff --git a/openvmm/openvmm_core/src/worker/spi_layout.rs b/openvmm/openvmm_core/src/worker/spi_layout.rs
index 87ed7d1031..b8ddad5074 100644
--- a/openvmm/openvmm_core/src/worker/spi_layout.rs
+++ b/openvmm/openvmm_core/src/worker/spi_layout.rs
@@ -34,7 +34,6 @@ impl SpiAllocator {
     }
 
     /// Allocates a single SPI, returning its GIC INTID.
-    #[expect(dead_code)] // used when SMMU instances are configured
     fn alloc(&mut self, tag: &str) -> anyhow::Result<u32> {
         if self.cursor <= self.range_start {
             anyhow::bail!("SPI exhausted allocating {tag}");
@@ -65,12 +64,25 @@ pub(super) struct SpiLayoutInput {
     /// Number of SPIs to reserve for GICv2m MSI delivery. `None` when using
     /// ITS (no v2m block needed).
     pub v2m_spi_count: Option<u32>,
+    /// Number of SMMUv3 instances. Each instance gets two SPIs (event queue
+    /// and global error).
+    pub smmu_count: usize,
 }
 
 /// Resolved SPI assignments for all platform devices.
 pub(super) struct ResolvedSpiLayout {
     /// GICv2m SPI base INTID. `None` when using ITS.
     pub v2m_spi_base: Option<u32>,
+    /// Per-SMMU SPI assignments, one entry per instance.
+    pub smmu: Vec<SmmuSpiAllocation>,
+}
+
+/// Allocated SPI pair for a single SMMUv3 instance.
+pub(super) struct SmmuSpiAllocation {
+    /// GIC INTID for the event queue interrupt.
+    pub evtq_gsiv: u32,
+    /// GIC INTID for the global error interrupt.
+    pub gerr_gsiv: u32,
 }
 
 /// Resolves SPI assignments for all platform devices.
@@ -91,7 +103,16 @@ pub(super) fn resolve_spi_layout(input: &SpiLayoutInput) -> anyhow::Result<Resol
         .map(|count| spi.alloc_block("gicv2m", count))
         .transpose()?;
 
-    Ok(ResolvedSpiLayout { v2m_spi_base })
+    // 2. SMMU instance SPIs (2 per instance: evtq + gerror).
+    let mut smmu = Vec::with_capacity(input.smmu_count);
+    for idx in 0..input.smmu_count {
+        smmu.push(SmmuSpiAllocation {
+            evtq_gsiv: spi.alloc(&format!("smmu{idx}-evtq"))?,
+            gerr_gsiv: spi.alloc(&format!("smmu{idx}-gerr"))?,
+        });
+    }
+
+    Ok(ResolvedSpiLayout { v2m_spi_base, smmu })
 }
 
 #[cfg(test)]
@@ -99,14 +120,19 @@ mod tests {
     use super::*;
 
     #[test]
-    fn v2m_allocation() {
+    fn v2m_then_smmu() {
         let result = resolve_spi_layout(&SpiLayoutInput {
             gic_nr_irqs: 992,
             v2m_spi_count: Some(64),
+            smmu_count: 2,
         })
         .unwrap();
 
         assert_eq!(result.v2m_spi_base, Some(928));
+        assert_eq!(result.smmu[0].evtq_gsiv, 927); // first SPI below the v2m block [928, 992)
+        assert_eq!(result.smmu[0].gerr_gsiv, 926);
+        assert_eq!(result.smmu[1].evtq_gsiv, 925);
+        assert_eq!(result.smmu[1].gerr_gsiv, 924);
     }
 
     #[test]
@@ -114,9 +140,25 @@ mod tests {
         let result = resolve_spi_layout(&SpiLayoutInput {
             gic_nr_irqs: 992,
             v2m_spi_count: None,
+            smmu_count: 1,
+        })
+        .unwrap();
+
+        assert_eq!(result.v2m_spi_base, None);
+        assert_eq!(result.smmu[0].evtq_gsiv, 1019);
+        assert_eq!(result.smmu[0].gerr_gsiv, 1018);
+    }
+
+    #[test]
+    fn no_devices() {
+        let result = resolve_spi_layout(&SpiLayoutInput {
+            gic_nr_irqs: 992, // required field; same value as the other tests
+            v2m_spi_count: None,
+            smmu_count: 0,
         })
         .unwrap();
 
         assert_eq!(result.v2m_spi_base, None);
+        assert!(result.smmu.is_empty());
     }
 }
diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs
index 4d272561c1..0099f72d12 100644
--- a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs
+++ b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs
@@ -147,6 +147,7 @@ fn build_dt(
     enable_serial: bool,
     processor_topology: &ProcessorTopology<Aarch64Topology>,
     pcie_host_bridges: &[PcieHostBridge],
+    smmu_configs: &[vmm_core::acpi_builder::AcpiSmmuConfig],
     initrd_start: u64,
     initrd_end: u64,
 ) -> Result<Vec<u8>, fdt::builder::Error> {
@@ -157,6 +158,8 @@ fn build_dt(
     const PL011_SERIAL0_IRQ: u32 = 1;
     const PL011_SERIAL1_BASE: u64 = 0xEFFEB000;
     const PL011_SERIAL1_IRQ: u32 = 2;
+    /// SMMUv3 MMIO region size: two 64 KiB pages (page 0 + page 1).
+    const SMMU_SIZE: u64 = 0x2_0000;
 
     let num_cpus = processor_topology.vps().len();
 
@@ -232,12 +235,16 @@ fn build_dt(
     let p_msi_controller = builder.add_string("msi-controller")?;
     let p_arm_msi_base_spi = builder.add_string("arm,msi-base-spi")?;
     let p_arm_msi_num_spis = builder.add_string("arm,msi-num-spis")?;
+    let p_iommu_cells = builder.add_string("#iommu-cells")?;
+    let p_iommu_map = builder.add_string("iommu-map")?;
 
     // Property handle values.
     const PHANDLE_GIC: u32 = 1;
     const PHANDLE_APB_PCLK: u32 = 2;
     const PHANDLE_V2M: u32 = 3;
     const PHANDLE_ITS: u32 = 4;
+    // SMMU phandles start at 5: SMMU instance N gets phandle 5 + N.
+    const PHANDLE_SMMU_BASE: u32 = 5;
 
     const GIC_SPI: u32 = 0;
     const GIC_PPI: u32 = 1;
@@ -362,6 +369,39 @@ fn build_dt(
         GicMsiController::None => gic_node.end_node()?,
     };
 
+    // SMMUv3 nodes (one per configured instance).
+    // Build a lookup from RC index → phandle for the iommu-map entries below.
+    let mut smmu_phandles: Vec<(u32, u32)> = Vec::new();
+    for (idx, smmu) in smmu_configs.iter().enumerate() {
+        let phandle = PHANDLE_SMMU_BASE + idx as u32;
+        smmu_phandles.push((smmu.rc_index, phandle));
+        // SPI interrupts use GIC_SPI encoding. The GSIV is the full INTID
+        // (e.g., 35), and the DT `interrupts` property wants the SPI number
+        // (INTID - 32) for GIC_SPI type.
+        let evtq_spi = smmu.event_gsiv - 32;
+        let gerr_spi = smmu.gerr_gsiv - 32;
+        root_builder = root_builder
+            .start_node(format!("smmu@{:x}", smmu.base).as_str())?
+            .add_str(p_compatible, "arm,smmu-v3")?
+            .add_u64_array(p_reg, &[smmu.base, SMMU_SIZE])?
+            .add_u32_array(
+                p_interrupts,
+                &[
+                    GIC_SPI,
+                    evtq_spi,
+                    IRQ_TYPE_EDGE_RISING,
+                    GIC_SPI,
+                    gerr_spi,
+                    IRQ_TYPE_EDGE_RISING,
+                ],
+            )?
+            .add_str_array(p_interrupt_names, &["eventq", "gerror"])?
+            .add_u32(p_iommu_cells, 1)?
+            .add_u32(p_phandle, phandle)?
+            .add_null(p_dma_coherent)?
+            .end_node()?;
+    }
+
     // ARM64 Architectural Timer.
     // The DT `interrupts` property uses the PPI offset (INTID - 16).
     assert!((16..32).contains(&processor_topology.virt_timer_ppi()));
@@ -457,6 +497,13 @@ fn build_dt(
             }
             GicMsiController::None => {}
         }
+        if let Some((_, phandle)) = smmu_phandles.iter().find(|(idx, _)| *idx == bridge.index) {
+            // iommu-map: <rid_base> <&smmu_phandle> <stream_id_base> <length>
+            // Maps the full RID range (0..0x10000) for this root complex
+            // through its SMMU instance. stream_id_base is 0 because each
+            // SMMU is 1:1 with its RC — stream IDs are plain BDFs.
+            node = node.add_u32_array(p_iommu_map, &[0, *phandle, 0, 0x10000])?;
+        }
         root_builder = node.end_node()?;
     }
 
@@ -787,6 +834,7 @@ pub fn load_linux_arm64(
     enable_serial: bool,
     processor_topology: &ProcessorTopology<Aarch64Topology>,
     pcie_host_bridges: &[PcieHostBridge],
+    smmu_configs: &[vmm_core::acpi_builder::AcpiSmmuConfig],
     build_acpi: Option<impl FnOnce(u64) -> vmm_core::acpi_builder::BuiltAcpiTables>,
 ) -> Result<Vec<Aarch64Register>, Error> {
     let mut loader = Loader::new(gm.clone(), cfg.mem_layout, hvdef::Vtl::Vtl0);
@@ -847,6 +895,7 @@ pub fn load_linux_arm64(
             enable_serial,
             processor_topology,
             pcie_host_bridges,
+            smmu_configs,
             initrd_start,
             initrd_end,
         )
diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs
index 1ccf871307..ad04398eab 100644
--- a/openvmm/openvmm_defs/src/config.rs
+++ b/openvmm/openvmm_defs/src/config.rs
@@ -297,11 +297,25 @@ pub enum GicMsiConfig {
     },
 }
 
+/// Per-instance SMMUv3 configuration for an aarch64 VM.
+///
+/// Each instance covers one PCIe root complex, identified by name.
+/// The SMMU's MMIO address is allocated dynamically by the memory layout
+/// engine.
+#[derive(Debug, Protobuf, Clone)]
+pub struct SmmuInstanceConfig {
+    /// Name of the PCIe root complex this SMMU covers.
+    pub rc_name: String,
+}
+
 #[derive(Debug, Protobuf, Default, Clone)]
 pub struct Aarch64TopologyConfig {
     pub gic_config: Option<GicConfig>,
     pub pmu_gsiv: PmuGsivConfig,
     pub gic_msi: GicMsiConfig,
+    /// SMMUv3 IOMMU instances. Each entry creates an SMMU for one PCIe root
+    /// complex (identified by name). Empty means no SMMU.
+    pub smmu: Vec<SmmuInstanceConfig>,
 }
 
 /// GIC configuration for the virtual machine.
diff --git a/openvmm/openvmm_entry/src/cli_args.rs b/openvmm/openvmm_entry/src/cli_args.rs
index b9889e8ad5..d2b10d17c0 100644
--- a/openvmm/openvmm_entry/src/cli_args.rs
+++ b/openvmm/openvmm_entry/src/cli_args.rs
@@ -391,6 +391,11 @@ options:
     #[clap(long, default_value = "auto")]
     pub gic_msi: GicMsiCli,
 
+    /// enable SMMUv3 IOMMU for an aarch64 PCIe root complex (repeatable, e.g. --smmu rc0 --smmu rc1)
+    #[cfg(guest_arch = "aarch64")]
+    #[clap(long, value_name = "RC_NAME")]
+    pub smmu: Vec<String>,
+
     /// COM1 binding (console | stderr | listen=\<path\> | file=\<path\> (overwrites) | listen=tcp:\<ip\>:\<port\> | term[=\<program\>]\[,name=\<windowtitle\>\] | none)
     #[clap(long, value_name = "SERIAL")]
     pub com1: Option<SerialConfigCli>,
diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs
index fbd1c888de..7e28c7bffb 100644
--- a/openvmm/openvmm_entry/src/lib.rs
+++ b/openvmm/openvmm_entry/src/lib.rs
@@ -1276,6 +1276,13 @@ async fn vm_config_from_command_line(
         vmbus_devices.push((openhcl_vtl, resource));
     }
 
+    #[cfg(guest_arch = "aarch64")]
+    let smmu_instances: Vec<openvmm_defs::config::SmmuInstanceConfig> = opt
+        .smmu
+        .iter()
+        .map(|s| openvmm_defs::config::SmmuInstanceConfig { rc_name: s.clone() })
+        .collect();
+
     #[cfg(guest_arch = "aarch64")]
     let topology_arch = openvmm_defs::config::ArchTopologyConfig::Aarch64(
         openvmm_defs::config::Aarch64TopologyConfig {
@@ -1289,6 +1296,7 @@ async fn vm_config_from_command_line(
                     openvmm_defs::config::GicMsiConfig::V2m { spi_count: None }
                 }
             },
+            smmu: smmu_instances,
         },
     );
     #[cfg(guest_arch = "x86_64")]
diff --git a/petri/src/vm/openvmm/modify.rs b/petri/src/vm/openvmm/modify.rs
index dcaadb322c..f7ea3077f5 100644
--- a/petri/src/vm/openvmm/modify.rs
+++ b/petri/src/vm/openvmm/modify.rs
@@ -29,6 +29,7 @@ use openvmm_defs::config::PcieMmioRangeConfig;
 use openvmm_defs::config::PcieRootComplexConfig;
 use openvmm_defs::config::PcieRootPortConfig;
 use openvmm_defs::config::PcieSwitchConfig;
+use openvmm_defs::config::SmmuInstanceConfig;
 use openvmm_defs::config::VpciDeviceConfig;
 use openvmm_defs::config::Vtl2BaseAddressType;
 use vm_resource::IntoResource;
@@ -301,6 +302,34 @@ impl PetriVmConfigOpenVmm {
         self
     }
 
+    /// Enable SMMUv3 IOMMU (stage 1 IOVA translation) on the given root
+    /// complexes, replacing any previously configured list (aarch64 only).
+    ///
+    /// Each name must match a root complex added via
+    /// [`with_pcie_root_topology`](Self::with_pcie_root_topology). Panics if
+    /// the arch topology is unset or is not aarch64.
+    pub fn with_smmu(mut self, rc_names: &[&str]) -> Self {
+        let arch = self
+            .config
+            .processor_topology
+            .arch
+            .as_mut()
+            .expect("arch topology not set");
+
+        match arch {
+            openvmm_defs::config::ArchTopologyConfig::Aarch64(aarch64) => {
+                aarch64.smmu = rc_names
+                    .iter()
+                    .map(|name| SmmuInstanceConfig {
+                        rc_name: name.to_string(),
+                    })
+                    .collect();
+            }
+            _ => panic!("SMMU is only supported on aarch64"),
+        }
+        self
+    }
+
     /// This is intended for special one-off use cases. As soon as something
     /// is needed in multiple tests we should consider making it a supported
     /// pattern.
diff --git a/vm/acpi_spec/src/iort.rs b/vm/acpi_spec/src/iort.rs
index d8000f1bec..b0cd71df82 100644
--- a/vm/acpi_spec/src/iort.rs
+++ b/vm/acpi_spec/src/iort.rs
@@ -18,9 +18,11 @@ pub const IORT_NODE_OFFSET: u32 = size_of::<crate::Header>() as u32 + size_of::<
 
 pub const IORT_NODE_TYPE_ITS_GROUP: u8 = 0x00;
 pub const IORT_NODE_TYPE_PCI_ROOT_COMPLEX: u8 = 0x02;
+pub const IORT_NODE_TYPE_SMMUV3: u8 = 0x04;
 
 pub const IORT_PCI_ROOT_COMPLEX_REVISION: u8 = 3;
 pub const IORT_ITS_GROUP_REVISION: u8 = 1;
+pub const IORT_SMMUV3_REVISION: u8 = 5;
 
 pub const IORT_NODE_COHERENT: u32 = 0x00000001;
 pub const IORT_MEMORY_ACCESS_COHERENCY: u8 = 1 << 0;
@@ -203,3 +205,99 @@ impl IortItsGroup {
 }
 
 const_assert_eq!(size_of::<IortItsGroup>(), 20);
+
+/// SMMUv3 node flags.
+pub const IORT_SMMUV3_FLAG_COHACC: u32 = 1 << 0;
+/// `device_id_mapping_index` is valid (IORT rev E.e / node rev 5+).
+pub const IORT_SMMUV3_FLAG_DEVICEID_VALID: u32 = 1 << 4;
+
+/// SMMUv3 model: generic SMMU-v3.
+pub const IORT_SMMUV3_MODEL_GENERIC: u32 = 0;
+
+/// SMMUv3 node per IORT spec DEN0049E §E.4.
+#[repr(C, packed)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)]
+pub struct IortSmmuV3 {
+    pub header: IortNodeHeader,
+    pub base_address: u64_ne,
+    pub flags: u32_ne,
+    pub reserved: u32_ne,
+    pub vatos_address: u64_ne,
+    pub model: u32_ne,
+    pub event_gsiv: u32_ne,
+    pub pri_gsiv: u32_ne,
+    pub gerr_gsiv: u32_ne,
+    pub sync_gsiv: u32_ne,
+    pub proximity_domain: u32_ne,
+    pub device_id_mapping_index: u32_ne,
+}
+
+impl IortSmmuV3 {
+    /// Create an SMMUv3 node with COHACC set, wired SPI interrupts (GSIVs),
+    /// and `device_id_mapping_index` 0. The header `length` reserves space for
+    /// `mapping_count` trailing `IortIdMapping` entries, which the caller must
+    /// append; `DEVICEID_VALID` is also set whenever `mapping_count > 0`.
+    pub fn new(
+        identifier: u32,
+        base_address: u64,
+        mapping_count: u32,
+        event_gsiv: u32,
+        gerr_gsiv: u32,
+    ) -> Self {
+        Self::new_with_device_id_mapping(
+            identifier,
+            base_address,
+            mapping_count,
+            event_gsiv,
+            gerr_gsiv,
+            0,
+        )
+    }
+
+    /// Create an SMMUv3 node with an explicit `device_id_mapping_index`.
+    ///
+    /// `device_id_mapping_index` selects which ID mapping entry Linux uses
+    /// for the SMMU's own MSI domain lookup. That mapping must have the
+    /// `IORT_ID_SINGLE_MAPPING` flag set. The `DEVICEID_VALID` node flag is
+    /// set whenever `mapping_count > 0`, regardless of the index value.
+    pub fn new_with_device_id_mapping(
+        identifier: u32,
+        base_address: u64,
+        mapping_count: u32,
+        event_gsiv: u32,
+        gerr_gsiv: u32,
+        device_id_mapping_index: u32,
+    ) -> Self {
+        let mut header = IortNodeHeader::new::<Self>(
+            IORT_NODE_TYPE_SMMUV3,
+            IORT_SMMUV3_REVISION,
+            identifier,
+            mapping_count,
+        );
+        let total =
+            size_of::<Self>() as u16 + (mapping_count as u16) * size_of::<IortIdMapping>() as u16;
+        header.length = total.into();
+        Self {
+            header,
+            base_address: base_address.into(),
+            flags: (IORT_SMMUV3_FLAG_COHACC
+                | if mapping_count > 0 {
+                    IORT_SMMUV3_FLAG_DEVICEID_VALID
+                } else {
+                    0
+                })
+            .into(),
+            reserved: 0.into(),
+            vatos_address: 0.into(),
+            model: IORT_SMMUV3_MODEL_GENERIC.into(),
+            event_gsiv: event_gsiv.into(),
+            pri_gsiv: 0.into(),
+            gerr_gsiv: gerr_gsiv.into(),
+            sync_gsiv: 0.into(),
+            proximity_domain: 0.into(),
+            device_id_mapping_index: device_id_mapping_index.into(),
+        }
+    }
+}
+
+const_assert_eq!(size_of::<IortSmmuV3>(), 68);
diff --git a/vm/devices/iommu/smmu/Cargo.toml b/vm/devices/iommu/smmu/Cargo.toml
new file mode 100644
index 0000000000..c6bc7868cd
--- /dev/null
+++ b/vm/devices/iommu/smmu/Cargo.toml
@@ -0,0 +1,24 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+[package]
+name = "smmu"
+edition.workspace = true
+rust-version.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+bitfield-struct.workspace = true
+chipset_device.workspace = true
+guestmem.workspace = true
+inspect.workspace = true
+open_enum.workspace = true
+pal_event.workspace = true
+parking_lot.workspace = true
+pci_core.workspace = true
+tracelimit.workspace = true
+vmcore.workspace = true
+zerocopy.workspace = true
+
+[lints]
+workspace = true
diff --git a/vm/devices/iommu/smmu/src/emulator.rs b/vm/devices/iommu/smmu/src/emulator.rs
new file mode 100644
index 0000000000..65d5a6fd2e
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/emulator.rs
@@ -0,0 +1,2139 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 device emulator — register file and MMIO dispatch.
+
+use crate::shared::SmmuSharedState;
+use crate::spec::commands::CmdEntry;
+use crate::spec::commands::CmdOpcode;
+use crate::spec::commands::CmdSync;
+use crate::spec::commands::SyncCs;
+use crate::spec::registers;
+use chipset_device::ChipsetDevice;
+use chipset_device::io::IoError;
+use chipset_device::io::IoResult;
+use chipset_device::mmio::MmioIntercept;
+use guestmem::GuestMemory;
+use inspect::Inspect;
+use inspect::InspectMut;
+use std::ops::RangeInclusive;
+use std::sync::Arc;
+use vmcore::device_state::ChangeDeviceState;
+use vmcore::line_interrupt::LineInterrupt;
+use vmcore::save_restore::RestoreError;
+use vmcore::save_restore::SaveError;
+use vmcore::save_restore::SaveRestore;
+use vmcore::save_restore::SavedStateNotSupported;
+
+/// Guest-visible capability parameters of the emulated SMMUv3.
+#[derive(Debug, Clone)]
+pub struct SmmuConfig {
+    /// Number of StreamID bits (max 32, typically 16); reported as IDR1.SIDSIZE.
+    pub sid_bits: u8,
+    /// Output address size encoding; reported verbatim as IDR5.OAS.
+    pub oas: u8,
+}
+
+impl Default for SmmuConfig {
+    fn default() -> Self {
+        // 16 StreamID bits; OAS encoding 0b010 selects a 40-bit output size.
+        let sid_bits = 16;
+        let oas = 0b010;
+        Self { sid_bits, oas }
+    }
+}
+
+/// MSI configuration register triple (IRQ_CFG0/1/2) for one interrupt source.
+#[derive(Debug, Default, Inspect)]
+struct MsiConfig {
+    /// MSI address (64-bit, from IRQ_CFG0).
+    addr: u64,
+    /// MSI data payload (32-bit, from IRQ_CFG1).
+    data: u32,
+    /// MSI attributes (32-bit, from IRQ_CFG2).
+    attr: u32,
+}
+
+/// SMMUv3 device emulator.
+///
+/// Implements MMIO register access for the SMMUv3 register file. The device
+/// responds to reads/writes across a 128KB region (page 0 + page 1).
+#[derive(InspectMut)]
+pub struct SmmuDevice {
+    // Static configuration: the named MMIO window and its base address.
+    #[inspect(skip)]
+    mmio_region: (&'static str, RangeInclusive<u64>),
+    #[inspect(skip)]
+    mmio_base: u64,
+
+    // Guest memory, used here to read CMDQ entries and write CMD_SYNC completion data.
+    #[inspect(skip)]
+    guest_memory: GuestMemory,
+
+    // Shared state for per-device translation wrappers.
+    #[inspect(skip)]
+    shared_state: Arc<SmmuSharedState>,
+
+    // Identification registers (read-only, set at construction).
+    idr0: registers::Idr0,
+    idr1: registers::Idr1,
+    #[inspect(hex)]
+    idr2: u32,
+    #[inspect(hex)]
+    idr3: u32,
+    #[inspect(hex)]
+    idr4: u32,
+    idr5: registers::Idr5,
+    #[inspect(hex)]
+    iidr: u32,
+    #[inspect(hex)]
+    aidr: u32,
+
+    // Control registers. CR0ACK mirrors CR0 immediately on every CR0 write.
+    cr0: registers::Cr0,
+    cr0ack: registers::Cr0,
+    cr1: registers::Cr1,
+    cr2: registers::Cr2,
+    gbpa: registers::Gbpa,
+
+    // Interrupt control. IRQ_CTRLACK mirrors IRQ_CTRL immediately.
+    irq_ctrl: registers::IrqCtrl,
+    irq_ctrlack: registers::IrqCtrl,
+
+    // Stream table base.
+    #[inspect(hex)]
+    strtab_base: u64,
+    strtab_base_cfg: registers::StrtabBaseCfg,
+
+    // Command queue: raw CMDQ_BASE, guest-written PROD, and device-owned CONS.
+    #[inspect(hex)]
+    cmdq_base: u64,
+    cmdq_prod: u32,
+    cmdq_cons: registers::CmdqCons,
+
+    // Event queue base register (raw value for MMIO read/write).
+    // EVTQ producer/consumer state lives in SmmuSharedState.
+    #[inspect(hex)]
+    evtq_base: u64,
+
+    // MSI configuration (stored for guest register access, not used for
+    // interrupt delivery since IDR0.MSI=0).
+    gerror_msi: MsiConfig,
+    evtq_msi: MsiConfig,
+    cmdq_msi: MsiConfig,
+}
+
+impl SmmuDevice {
+    /// Creates a new SMMUv3 device in its power-on register state.
+    ///
+    /// `mmio_base` is the physical address for the 128KB MMIO region.
+    /// `guest_memory` is used for reading command/event queues and page tables.
+    /// `config` supplies the advertised IDR1.SIDSIZE and IDR5.OAS values.
+    /// `evtq_irq` / `gerror_irq` are the optional wired event queue / global error lines.
+    pub fn new(
+        mmio_base: u64,
+        guest_memory: GuestMemory,
+        config: &SmmuConfig,
+        evtq_irq: Option<LineInterrupt>,
+        gerror_irq: Option<LineInterrupt>,
+    ) -> Self {
+        let idr0 = registers::Idr0::new()
+            .with_s1p(true)
+            .with_s2p(false)
+            .with_ttf(0b10) // AArch64 only
+            .with_cohacc(true)
+            .with_asid16(true)
+            .with_msi(false)
+            .with_ttendian(0b10) // Little-endian
+            .with_st_level(0b00); // Linear stream table only
+
+        let idr1 = registers::Idr1::new()
+            .with_sidsize(config.sid_bits)
+            .with_ssidsize(0)
+            .with_cmdqs(8) // 256 entries max
+            .with_eventqs(8) // 256 entries max
+            .with_attr_types_ovr(true)
+            .with_tables_preset(false)
+            .with_queues_preset(false)
+            .with_rel(false);
+
+        let idr5 = registers::Idr5::new()
+            .with_oas(config.oas)
+            .with_gran4k(true)
+            .with_gran16k(false)
+            .with_gran64k(false);
+
+        // GBPA defaults to ABORT=1 (abort all transactions when SMMU is disabled).
+        let gbpa = registers::Gbpa::new().with_abort(true);
+
+        let shared_state = SmmuSharedState::new(guest_memory.clone(), evtq_irq, gerror_irq);
+
+        SmmuDevice {
+            mmio_region: (
+                "smmu",
+                mmio_base..=mmio_base + registers::MMIO_REGION_SIZE - 1,
+            ),
+            mmio_base,
+            guest_memory,
+            shared_state,
+
+            idr0,
+            idr1,
+            idr2: 0,
+            idr3: 0,
+            idr4: 0,
+            idr5,
+            iidr: 0,
+            aidr: 0x03, // SMMUv3.3
+
+            cr0: registers::Cr0::new(),
+            cr0ack: registers::Cr0::new(),
+            cr1: registers::Cr1::new(),
+            cr2: registers::Cr2::new(),
+            gbpa,
+
+            irq_ctrl: registers::IrqCtrl::new(),
+            irq_ctrlack: registers::IrqCtrl::new(),
+
+            strtab_base: 0,
+            strtab_base_cfg: registers::StrtabBaseCfg::new(),
+
+            cmdq_base: 0,
+            cmdq_prod: 0,
+            cmdq_cons: registers::CmdqCons::new(),
+
+            evtq_base: 0,
+
+            gerror_msi: MsiConfig::default(),
+            evtq_msi: MsiConfig::default(),
+            cmdq_msi: MsiConfig::default(),
+        }
+    }
+
+    /// Borrows the `Arc` of shared state used to create per-device translation wrappers.
+    pub fn shared_state(&self) -> &Arc<SmmuSharedState> {
+        &self.shared_state
+    }
+
+    /// Handles a 32-bit page 0 MMIO read; unhandled offsets log a warning and read as 0.
+    fn read_reg32(&self, offset: u32) -> u32 {
+        match offset as u16 {
+            registers::IDR0 => self.idr0.into(),
+            registers::IDR1 => self.idr1.into(),
+            registers::IDR2 => self.idr2,
+            registers::IDR3 => self.idr3,
+            registers::IDR4 => self.idr4,
+            registers::IDR5 => self.idr5.into(),
+            registers::IIDR => self.iidr,
+            registers::AIDR => self.aidr,
+
+            registers::CR0 => self.cr0.into(),
+            registers::CR0ACK => self.cr0ack.into(),
+            registers::CR1 => self.cr1.into(),
+            registers::CR2 => self.cr2.into(),
+            registers::STATUSR => 0,
+            registers::GBPA => self.gbpa.into(),
+            registers::AGBPA => 0,
+
+            registers::IRQ_CTRL => self.irq_ctrl.into(),
+            registers::IRQ_CTRLACK => self.irq_ctrlack.into(),
+
+            registers::GERROR => self.shared_state.read_gerror().into(),
+            registers::GERRORN => self.shared_state.read_gerrorn().into(),
+
+            registers::STRTAB_BASE_CFG => self.strtab_base_cfg.into(),
+
+            registers::CMDQ_PROD => self.cmdq_prod,
+            registers::CMDQ_CONS => self.cmdq_cons.into(),
+
+            // GERROR MSI data/attribute words (stored only; IDR0.MSI is advertised as 0).
+            registers::GERROR_IRQ_CFG1 => self.gerror_msi.data,
+            registers::GERROR_IRQ_CFG2 => self.gerror_msi.attr,
+
+            // EVENTQ MSI data/attribute words (stored only, as above).
+            registers::EVENTQ_IRQ_CFG1 => self.evtq_msi.data,
+            registers::EVENTQ_IRQ_CFG2 => self.evtq_msi.attr,
+
+            _ => {
+                tracelimit::warn_ratelimited!(offset, "smmu: unhandled 32-bit MMIO read");
+                0
+            }
+        }
+    }
+
+    /// Handles a 64-bit page 0 MMIO read; unhandled offsets log a warning and read as 0.
+    fn read_reg64(&self, offset: u32) -> u64 {
+        match offset as u16 {
+            registers::STRTAB_BASE => self.strtab_base,
+            registers::CMDQ_BASE => self.cmdq_base,
+            registers::EVENTQ_BASE => self.evtq_base,
+            registers::GERROR_IRQ_CFG0 => self.gerror_msi.addr,
+            registers::EVENTQ_IRQ_CFG0 => self.evtq_msi.addr,
+            _ => {
+                tracelimit::warn_ratelimited!(offset, "smmu: unhandled 64-bit MMIO read");
+                0
+            }
+        }
+    }
+
+    /// Handles a 32-bit page 0 MMIO write; unhandled offsets are logged and dropped.
+    fn write_reg32(&mut self, offset: u32, value: u32) {
+        match offset as u16 {
+            // Read-only registers: ignore writes.
+            registers::IDR0
+            | registers::IDR1
+            | registers::IDR2
+            | registers::IDR3
+            | registers::IDR4
+            | registers::IDR5
+            | registers::IIDR
+            | registers::AIDR
+            | registers::CR0ACK
+            | registers::STATUSR
+            | registers::IRQ_CTRLACK => {}
+
+            registers::CR0 => {
+                self.cr0 = registers::Cr0::from(value);
+                // Immediate acknowledge — no async enable sequence.
+                self.cr0ack = self.cr0;
+                // Sync enable state to shared state for per-device wrappers.
+                self.shared_state.set_enabled(self.cr0.smmuen());
+                self.shared_state.set_evtq_enabled(self.cr0.eventqen());
+            }
+            registers::CR1 => {
+                self.cr1 = registers::Cr1::from(value);
+            }
+            registers::CR2 => {
+                self.cr2 = registers::Cr2::from(value);
+            }
+            registers::GBPA => {
+                // Clear the UPDATE bit on write (the SMMU "completes" the
+                // update immediately).
+                let mut gbpa = registers::Gbpa::from(value);
+                gbpa.set_update(false);
+                self.gbpa = gbpa;
+            }
+            registers::IRQ_CTRL => {
+                self.irq_ctrl = registers::IrqCtrl::from(value);
+                // Immediate acknowledge.
+                self.irq_ctrlack = self.irq_ctrl;
+                self.shared_state
+                    .set_irq_ctrl(self.irq_ctrl.eventq_irqen(), self.irq_ctrl.gerror_irqen());
+            }
+            registers::GERRORN => {
+                self.shared_state.write_gerrorn(value);
+            }
+
+            registers::STRTAB_BASE_CFG => {
+                self.strtab_base_cfg = registers::StrtabBaseCfg::from(value);
+                self.sync_strtab_to_shared();
+            }
+
+            registers::CMDQ_PROD => {
+                self.cmdq_prod = value;
+                self.process_cmdq();
+            }
+            // CMDQ_CONS is guarded by CR0.CMDQEN: software initializes it while
+            // the queue is disabled; writes while it is enabled are ignored.
+            registers::CMDQ_CONS if self.cr0.cmdqen() => {}
+            registers::CMDQ_CONS => self.cmdq_cons = registers::CmdqCons::from(value),
+
+            registers::GERROR_IRQ_CFG1 => self.gerror_msi.data = value,
+            registers::GERROR_IRQ_CFG2 => self.gerror_msi.attr = value,
+
+            registers::EVENTQ_IRQ_CFG1 => self.evtq_msi.data = value,
+            registers::EVENTQ_IRQ_CFG2 => self.evtq_msi.attr = value,
+
+            _ => {
+                tracelimit::warn_ratelimited!(offset, value, "smmu: unhandled 32-bit MMIO write");
+            }
+        }
+    }
+
+    /// Handles a 64-bit page 0 MMIO write (STRTAB/EVENTQ bases are pushed to shared state).
+    fn write_reg64(&mut self, offset: u32, value: u64) {
+        match offset as u16 {
+            registers::STRTAB_BASE => {
+                self.strtab_base = value;
+                self.sync_strtab_to_shared();
+            }
+            registers::CMDQ_BASE => {
+                self.cmdq_base = value;
+            }
+            registers::EVENTQ_BASE => {
+                self.evtq_base = value;
+                self.sync_evtq_to_shared();
+            }
+            registers::GERROR_IRQ_CFG0 => self.gerror_msi.addr = value,
+            registers::EVENTQ_IRQ_CFG0 => self.evtq_msi.addr = value,
+
+            _ => {
+                tracelimit::warn_ratelimited!(offset, value, "smmu: unhandled 64-bit MMIO write");
+            }
+        }
+    }
+
+    /// Handles 32-bit page 1 reads; `offset` is relative to the device base (>= 0x10000).
+    fn read_page1_reg32(&self, offset: u32) -> u32 {
+        match offset {
+            registers::EVENTQ_PROD_PAGE1 => self.shared_state.evtq_prod(),
+            registers::EVENTQ_CONS_PAGE1 => self.shared_state.evtq_cons(),
+            registers::CMDQ_IRQ_CFG1_PAGE1 => self.cmdq_msi.data,
+            registers::CMDQ_IRQ_CFG2_PAGE1 => self.cmdq_msi.attr,
+            _ => {
+                tracelimit::warn_ratelimited!(offset, "smmu: unhandled page 1 32-bit MMIO read");
+                0
+            }
+        }
+    }
+
+    /// Handles 64-bit page 1 reads; `offset` is relative to the device base (>= 0x10000).
+    fn read_page1_reg64(&self, offset: u32) -> u64 {
+        match offset {
+            registers::CMDQ_IRQ_CFG0_PAGE1 => self.cmdq_msi.addr,
+            _ => {
+                tracelimit::warn_ratelimited!(offset, "smmu: unhandled page 1 64-bit MMIO read");
+                0
+            }
+        }
+    }
+
+    /// Handles 32-bit page 1 writes; `offset` is relative to the device base (>= 0x10000).
+    fn write_page1_reg32(&mut self, offset: u32, value: u32) {
+        match offset {
+            registers::EVENTQ_PROD_PAGE1 => {
+                // Guest writes to EVENTQ_PROD are dropped; the SMMU side owns it.
+                // NOTE(review): drivers init PROD while EVENTQEN=0 — confirm dropping is OK.
+            }
+            registers::EVENTQ_CONS_PAGE1 => {
+                self.shared_state.set_evtq_cons(value);
+            }
+            registers::CMDQ_IRQ_CFG1_PAGE1 => self.cmdq_msi.data = value,
+            registers::CMDQ_IRQ_CFG2_PAGE1 => self.cmdq_msi.attr = value,
+            _ => {
+                tracelimit::warn_ratelimited!(
+                    offset,
+                    value,
+                    "smmu: unhandled page 1 32-bit MMIO write"
+                );
+            }
+        }
+    }
+
+    /// Handles 64-bit page 1 writes; `offset` is relative to the device base (>= 0x10000).
+    fn write_page1_reg64(&mut self, offset: u32, value: u64) {
+        match offset {
+            registers::CMDQ_IRQ_CFG0_PAGE1 => self.cmdq_msi.addr = value,
+            _ => {
+                tracelimit::warn_ratelimited!(
+                    offset,
+                    value,
+                    "smmu: unhandled page 1 64-bit MMIO write"
+                );
+            }
+        }
+    }
+
+    // =========================================================================
+    // Shared State Synchronization
+    // =========================================================================
+
+    /// Pushes the STRTAB_BASE address field and STRTAB_BASE_CFG.LOG2SIZE to the shared state.
+    fn sync_strtab_to_shared(&self) {
+        let strtab = registers::StrtabBase::from(self.strtab_base);
+        let size_log2 = self.strtab_base_cfg.log2size();
+        self.shared_state.set_strtab(strtab.addr(), size_log2);
+    }
+
+    /// Pushes the EVENTQ_BASE address and its LOG2SIZE (capped at IDR1.EVENTQS) to shared state.
+    fn sync_evtq_to_shared(&self) {
+        let queue = registers::QueueBase::from(self.evtq_base);
+        let max_log2size = self.idr1.eventqs();
+        let log2size = queue.log2size().min(max_log2size);
+        self.shared_state.set_evtq_config(queue.addr(), log2size);
+    }
+
+    // =========================================================================
+    // Command Queue Processing
+    // =========================================================================
+
+    /// Effective log2 of the command queue entry count: the LOG2SIZE field
+    /// of CMDQ_BASE, capped at the IDR1.CMDQS value this device reports.
+    fn cmdq_log2size(&self) -> u8 {
+        let queue_base = registers::QueueBase::from(self.cmdq_base);
+        let requested = queue_base.log2size();
+        requested.min(self.idr1.cmdqs())
+    }
+
+    /// Returns the command queue base GPA: the address field decoded from raw CMDQ_BASE.
+    fn cmdq_base_addr(&self) -> u64 {
+        registers::QueueBase::from(self.cmdq_base).addr()
+    }
+
+    /// Checks if CMDQ processing is enabled. Only CR0.CMDQEN is consulted;
+    /// CR0.SMMUEN is not required for commands to be consumed.
+    fn cmdq_enabled(&self) -> bool {
+        self.cr0.cmdqen()
+    }
+
+    /// Returns true if the shared state reports an active (unacknowledged) CMDQ error.
+    fn cmdq_has_error(&self) -> bool {
+        self.shared_state.cmdq_err_active()
+    }
+
+    /// Process all pending commands in the command queue.
+    ///
+    /// Called when the guest writes CMDQ_PROD. Consumes entries from CMDQ_CONS.RD
+    /// up to CMDQ_PROD; on a read failure or unknown opcode RD stays on that entry.
+    fn process_cmdq(&mut self) {
+        if !self.cmdq_enabled() {
+            return;
+        }
+
+        // Don't process if there's an outstanding CMDQ error.
+        if self.cmdq_has_error() {
+            return;
+        }
+
+        let log2size = self.cmdq_log2size() as u32;
+        let max_entries = 1u32 << log2size;
+        // The wrap mask includes the wrap bit: (2 * max_entries - 1).
+        let index_mask = (max_entries << 1) - 1;
+        let base_addr = self.cmdq_base_addr();
+
+        // Mask RD like PROD so stale bits from a larger queue size cannot wedge the loop.
+        let mut cons = self.cmdq_cons.rd() & index_mask;
+        let prod = self.cmdq_prod & index_mask;
+
+        // Limit iterations to prevent infinite loops on malformed state.
+        let mut iterations = 0u32;
+
+        while cons != prod {
+            if iterations >= max_entries {
+                // Safety valve: should never happen with well-behaved software.
+                tracelimit::warn_ratelimited!("smmu: CMDQ processing exceeded max iterations");
+                break;
+            }
+            iterations += 1;
+
+            // Compute the entry address: index within the queue (without wrap bit).
+            let index = cons & (max_entries - 1);
+            let entry_addr = base_addr + (index as u64) * (size_of::<CmdEntry>() as u64);
+
+            // Read the 16-byte command entry from guest memory.
+            let entry = match self.guest_memory.read_plain::<CmdEntry>(entry_addr) {
+                Ok(entry) => entry,
+                Err(e) => {
+                    tracelimit::warn_ratelimited!(
+                        error = &e as &dyn std::error::Error,
+                        entry_addr,
+                        "smmu: failed to read CMDQ entry from guest memory"
+                    );
+                    // Set CMDQ error: abort.
+                    self.set_cmdq_error(registers::CmdqError::CERROR_ABT);
+                    break;
+                }
+            };
+
+            match entry.opcode() {
+                // Configuration invalidation commands — no-op (no cache yet).
+                CmdOpcode::PREFETCH_CFG
+                | CmdOpcode::CFGI_STE
+                | CmdOpcode::CFGI_STE_RANGE
+                | CmdOpcode::CFGI_CD
+                | CmdOpcode::CFGI_CD_ALL => {}
+
+                // TLB invalidation commands — no-op (no TLB yet).
+                CmdOpcode::TLBI_NH_ALL
+                | CmdOpcode::TLBI_NH_ASID
+                | CmdOpcode::TLBI_NH_VA
+                | CmdOpcode::TLBI_NH_VAA
+                | CmdOpcode::TLBI_S12_VMALL
+                | CmdOpcode::TLBI_NSNH_ALL => {}
+
+                // Synchronization command.
+                CmdOpcode::CMD_SYNC => {
+                    self.handle_cmd_sync(&entry);
+                }
+
+                // Unknown opcode — set CMDQ error.
+                opcode => {
+                    tracelimit::warn_ratelimited!(?opcode, "smmu: unknown CMDQ opcode");
+                    self.set_cmdq_error(registers::CmdqError::CERROR_ILL);
+                    break;
+                }
+            }
+
+            // Advance cons with wrap.
+            cons = (cons + 1) & index_mask;
+        }
+
+        // Update the stored CMDQ_CONS (preserve error field, update rd).
+        self.cmdq_cons.set_rd(cons);
+    }
+
+    /// Handle a CMD_SYNC command according to its completion-signal (CS) field.
+    ///
+    /// With IDR0.MSI=0, Linux uses CS=SIG_SEV and polls CMDQ_CONS for
+    /// completion. The MSIWrite path is kept for spec compliance but won't
+    /// be exercised by Linux when MSI is not advertised.
+    fn handle_cmd_sync(&mut self, entry: &CmdEntry) {
+        let cmd = CmdSync::from(entry.qw0);
+        let cs = SyncCs(cmd.cs());
+
+        match cs {
+            SyncCs::SIG_NONE | SyncCs::SIG_SEV => {
+                // No signal or SEV — nothing to do. Linux polls CMDQ_CONS.
+            }
+            SyncCs::SIG_IRQ => {
+                // Write the MSI payload to the command's MSI address; a zero address is skipped.
+                let msi_addr = CmdSync::msi_write_addr_from_entry(entry);
+                let msi_data = cmd.msi_data();
+
+                if msi_addr != 0 {
+                    if let Err(e) = self
+                        .guest_memory
+                        .write_at(msi_addr, &msi_data.to_le_bytes())
+                    {
+                        tracelimit::warn_ratelimited!(
+                            error = &e as &dyn std::error::Error,
+                            msi_addr,
+                            "smmu: failed to write CMD_SYNC MSI to guest memory"
+                        );
+                    }
+                }
+            }
+            _ => {
+                tracelimit::warn_ratelimited!(cs = cs.0, "smmu: unknown CMD_SYNC CS value");
+            }
+        }
+    }
+
+    /// Records a command queue error: stores `error` in the CMDQ_CONS error
+    /// field and toggles GERROR.CMDQ_ERR through the shared state.
+    fn set_cmdq_error(&mut self, error: registers::CmdqError) {
+        // Set error code in CMDQ_CONS.
+        self.cmdq_cons.set_err(error.0);
+        // Toggle GERROR.CMDQ_ERR; presumably this also updates the interrupt line (confirm).
+        self.shared_state.toggle_cmdq_err();
+    }
+
+    // =========================================================================
+    // Event Queue
+    // =========================================================================
+}
+
+impl ChipsetDevice for SmmuDevice {
+    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
+        Some(self as &mut dyn MmioIntercept)
+    }
+}
+
+impl ChangeDeviceState for SmmuDevice {
+    fn start(&mut self) {}
+
+    async fn stop(&mut self) {}
+
+    async fn reset(&mut self) {
+        let SmmuDevice {
+            // Static configuration — not reset.
+            mmio_region: _,
+            mmio_base: _,
+            guest_memory: _,
+            shared_state,
+
+            // Identification registers — read-only, not reset.
+            idr0: _,
+            idr1: _,
+            idr2: _,
+            idr3: _,
+            idr4: _,
+            idr5: _,
+            iidr: _,
+            aidr: _,
+
+            // Control registers — reset to power-on defaults.
+            cr0,
+            cr0ack,
+            cr1,
+            cr2,
+            gbpa,
+
+            // Interrupt control.
+            irq_ctrl,
+            irq_ctrlack,
+
+            // Stream table base.
+            strtab_base,
+            strtab_base_cfg,
+
+            // Command queue.
+            cmdq_base,
+            cmdq_prod,
+            cmdq_cons,
+
+            // Event queue base register.
+            evtq_base,
+
+            // MSI configuration.
+            gerror_msi,
+            evtq_msi,
+            cmdq_msi,
+        } = self;
+
+        *cr0 = registers::Cr0::new();
+        *cr0ack = registers::Cr0::new();
+        *cr1 = registers::Cr1::new();
+        *cr2 = registers::Cr2::new();
+        *gbpa = registers::Gbpa::new().with_abort(true);
+
+        *irq_ctrl = registers::IrqCtrl::new();
+        *irq_ctrlack = registers::IrqCtrl::new();
+
+        *strtab_base = 0;
+        *strtab_base_cfg = registers::StrtabBaseCfg::new();
+
+        *cmdq_base = 0;
+        *cmdq_prod = 0;
+        *cmdq_cons = registers::CmdqCons::new();
+
+        *evtq_base = 0;
+
+        *gerror_msi = MsiConfig::default();
+        *evtq_msi = MsiConfig::default();
+        *cmdq_msi = MsiConfig::default();
+
+        // Sync disabled state to shared state so per-device wrappers
+        // bypass translation immediately.
+        shared_state.set_enabled(false);
+        shared_state.set_strtab(0, 0);
+        // Resets EVTQ (prod, cons, config, enabled) and GERROR state, deasserting the IRQ.
+        // NOTE(review): confirm this also clears the enables pushed via set_irq_ctrl.
+        shared_state.reset_queue_state();
+    }
+}
+
+impl SaveRestore for SmmuDevice {
+    type SavedState = SavedStateNotSupported;
+
+    fn save(&mut self) -> Result<SavedStateNotSupported, SaveError> {
+        Err(SaveError::NotSupported)
+    }
+
+    fn restore(&mut self, saved: SavedStateNotSupported) -> Result<(), RestoreError> {
+        match saved {}
+    }
+}
+
+impl MmioIntercept for SmmuDevice {
+    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
+        let offset = (addr - self.mmio_base) as u32;
+
+        if offset >= 0x10000 {
+            // Page 1: handlers receive the device-relative offset; only 4/8-byte reads accepted.
+            match data.len() {
+                4 => {
+                    let value = self.read_page1_reg32(offset);
+                    data.copy_from_slice(&value.to_le_bytes());
+                }
+                8 => {
+                    let value = self.read_page1_reg64(offset);
+                    data.copy_from_slice(&value.to_le_bytes());
+                }
+                _ => return IoResult::Err(IoError::InvalidAccessSize),
+            }
+        } else {
+            // Page 0: offset fits in 16 bits here; only 4/8-byte reads accepted.
+            match data.len() {
+                4 => {
+                    let value = self.read_reg32(offset);
+                    data.copy_from_slice(&value.to_le_bytes());
+                }
+                8 => {
+                    let value = self.read_reg64(offset);
+                    data.copy_from_slice(&value.to_le_bytes());
+                }
+                _ => return IoResult::Err(IoError::InvalidAccessSize),
+            }
+        }
+
+        IoResult::Ok
+    }
+
+    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
+        let offset = (addr - self.mmio_base) as u32;
+
+        if offset >= 0x10000 {
+            // Page 1: the unwraps cannot fail because the length is matched first.
+            match data.len() {
+                4 => {
+                    let value = u32::from_le_bytes(data.try_into().unwrap());
+                    self.write_page1_reg32(offset, value);
+                }
+                8 => {
+                    let value = u64::from_le_bytes(data.try_into().unwrap());
+                    self.write_page1_reg64(offset, value);
+                }
+                _ => return IoResult::Err(IoError::InvalidAccessSize),
+            }
+        } else {
+            // Page 0: the unwraps cannot fail because the length is matched first.
+            match data.len() {
+                4 => {
+                    let value = u32::from_le_bytes(data.try_into().unwrap());
+                    self.write_reg32(offset, value);
+                }
+                8 => {
+                    let value = u64::from_le_bytes(data.try_into().unwrap());
+                    self.write_reg64(offset, value);
+                }
+                _ => return IoResult::Err(IoError::InvalidAccessSize),
+            }
+        }
+
+        IoResult::Ok
+    }
+
+    fn get_static_regions(&mut self) -> &[(&str, RangeInclusive<u64>)] {
+        std::slice::from_ref(&self.mmio_region)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::spec::events::EvtEntry;
+    use crate::spec::registers::*;
+
+    const TEST_MMIO_BASE: u64 = 0x0900_0000;
+
+    fn make_test_device() -> SmmuDevice {
+        let config = SmmuConfig::default();
+        SmmuDevice::new(TEST_MMIO_BASE, GuestMemory::empty(), &config, None, None)
+    }
+
+    /// Reads a page 0 register with a 4-byte MMIO access, asserting success.
+    fn read32(dev: &mut SmmuDevice, reg_offset: u16) -> u32 {
+        let addr = TEST_MMIO_BASE + u64::from(reg_offset);
+        let mut bytes = [0u8; 4];
+        assert!(matches!(dev.mmio_read(addr, &mut bytes), IoResult::Ok));
+        u32::from_le_bytes(bytes)
+    }
+
+    /// Writes a page 0 register with a 4-byte MMIO access, asserting success.
+    fn write32(dev: &mut SmmuDevice, reg_offset: u16, value: u32) {
+        let addr = TEST_MMIO_BASE + u64::from(reg_offset);
+        let outcome = dev.mmio_write(addr, &value.to_le_bytes());
+        assert!(matches!(outcome, IoResult::Ok));
+    }
+
+    /// Reads a page 0 register with an 8-byte MMIO access, asserting success.
+    fn read64(dev: &mut SmmuDevice, reg_offset: u16) -> u64 {
+        let addr = TEST_MMIO_BASE + u64::from(reg_offset);
+        let mut bytes = [0u8; 8];
+        assert!(matches!(dev.mmio_read(addr, &mut bytes), IoResult::Ok));
+        u64::from_le_bytes(bytes)
+    }
+
+    /// Writes a page 0 register with an 8-byte MMIO access, asserting success.
+    fn write64(dev: &mut SmmuDevice, reg_offset: u16, value: u64) {
+        let addr = TEST_MMIO_BASE + u64::from(reg_offset);
+        let outcome = dev.mmio_write(addr, &value.to_le_bytes());
+        assert!(matches!(outcome, IoResult::Ok));
+    }
+
+    /// Reads 4 bytes at `abs_offset` from the device base (page 1 is >= 0x10000).
+    fn read32_page1(dev: &mut SmmuDevice, abs_offset: u32) -> u32 {
+        let addr = TEST_MMIO_BASE + u64::from(abs_offset);
+        let mut bytes = [0u8; 4];
+        assert!(matches!(dev.mmio_read(addr, &mut bytes), IoResult::Ok));
+        u32::from_le_bytes(bytes)
+    }
+
+    /// Writes 4 bytes at `abs_offset` from the device base (page 1 is >= 0x10000).
+    fn write32_page1(dev: &mut SmmuDevice, abs_offset: u32, value: u32) {
+        let addr = TEST_MMIO_BASE + u64::from(abs_offset);
+        let outcome = dev.mmio_write(addr, &value.to_le_bytes());
+        assert!(matches!(outcome, IoResult::Ok));
+    }
+
+    /// Reads 8 bytes at `abs_offset` from the device base (page 1 is >= 0x10000).
+    fn read64_page1(dev: &mut SmmuDevice, abs_offset: u32) -> u64 {
+        let addr = TEST_MMIO_BASE + u64::from(abs_offset);
+        let mut bytes = [0u8; 8];
+        assert!(matches!(dev.mmio_read(addr, &mut bytes), IoResult::Ok));
+        u64::from_le_bytes(bytes)
+    }
+
+    /// Writes 8 bytes at `abs_offset` from the device base (page 1 is >= 0x10000).
+    fn write64_page1(dev: &mut SmmuDevice, abs_offset: u32, value: u64) {
+        let addr = TEST_MMIO_BASE + u64::from(abs_offset);
+        let outcome = dev.mmio_write(addr, &value.to_le_bytes());
+        assert!(matches!(outcome, IoResult::Ok));
+    }
+
+    #[test]
+    fn test_idr_readback() {
+        let mut dev = make_test_device();
+
+        // IDR0: S1P=1, S2P=0, TTF=0b10, COHACC=1, ASID16=1, MSI=0, TTENDIAN=0b10,
+        //       ST_LVL=0b00
+        let idr0 = Idr0::from(read32(&mut dev, IDR0));
+        assert!(idr0.s1p());
+        assert!(!idr0.s2p());
+        assert_eq!(idr0.ttf(), 0b10);
+        assert!(idr0.cohacc());
+        assert!(idr0.asid16());
+        assert!(!idr0.msi());
+        assert_eq!(idr0.ttendian(), 0b10);
+        assert_eq!(idr0.st_level(), 0b00);
+
+        // IDR1: SIDSIZE=16, CMDQS=8, EVTQS=8, ATTR_TYPES_OVR=1
+        let idr1 = Idr1::from(read32(&mut dev, IDR1));
+        assert_eq!(idr1.sidsize(), 16);
+        assert_eq!(idr1.cmdqs(), 8);
+        assert_eq!(idr1.eventqs(), 8);
+        assert!(idr1.attr_types_ovr());
+        assert!(!idr1.tables_preset());
+        assert!(!idr1.queues_preset());
+        assert!(!idr1.rel());
+
+        // IDR2, IDR3, IDR4 = 0
+        assert_eq!(read32(&mut dev, IDR2), 0);
+        assert_eq!(read32(&mut dev, IDR3), 0);
+        assert_eq!(read32(&mut dev, IDR4), 0);
+
+        // IDR5: GRAN4K=1, OAS=0b010 (40-bit)
+        let idr5 = Idr5::from(read32(&mut dev, IDR5));
+        assert!(idr5.gran4k());
+        assert!(!idr5.gran16k());
+        assert!(!idr5.gran64k());
+        assert_eq!(idr5.oas(), 0b010);
+
+        // IIDR = 0
+        assert_eq!(read32(&mut dev, IIDR), 0);
+
+        // AIDR = 0x03 (SMMUv3.3)
+        assert_eq!(read32(&mut dev, AIDR), 0x03);
+    }
+
+    #[test]
+    fn test_cr0_ack_echo() {
+        let mut smmu = make_test_device();
+
+        // Request every enable bit in a single CR0 write.
+        let requested = Cr0::new()
+            .with_smmuen(true)
+            .with_cmdqen(true)
+            .with_eventqen(true);
+        write32(&mut smmu, CR0, requested.into());
+
+        // CR0ACK must echo exactly what was written.
+        let acked = read32(&mut smmu, CR0ACK);
+        assert_eq!(acked, u32::from(requested));
+    }
+
+    #[test]
+    fn test_cr0_enable_sequence() {
+        let mut smmu = make_test_device();
+
+        // Stage 1: turn on the command queue only.
+        let stage1 = Cr0::new().with_cmdqen(true);
+        write32(&mut smmu, CR0, stage1.into());
+        let acked = Cr0::from(read32(&mut smmu, CR0ACK));
+        assert!(acked.cmdqen());
+        assert!(!acked.eventqen());
+        assert!(!acked.smmuen());
+
+        // Stage 2: additionally turn on the event queue.
+        let stage2 = stage1.with_eventqen(true);
+        write32(&mut smmu, CR0, stage2.into());
+        let acked = Cr0::from(read32(&mut smmu, CR0ACK));
+        assert!(acked.cmdqen());
+        assert!(acked.eventqen());
+        assert!(!acked.smmuen());
+
+        // Stage 3: finally turn on the SMMU itself.
+        let stage3 = stage2.with_smmuen(true);
+        write32(&mut smmu, CR0, stage3.into());
+        let acked = Cr0::from(read32(&mut smmu, CR0ACK));
+        assert!(acked.cmdqen());
+        assert!(acked.eventqen());
+        assert!(acked.smmuen());
+    }
+
+    #[test]
+    fn test_strtab_base_readback() {
+        let mut smmu = make_test_device();
+
+        // Program STRTAB_BASE (64-bit) with a table address and the RA hint set.
+        let written = StrtabBase::new()
+            .with_addr_bits(0x1234_5678_9AB0u64 >> 6)
+            .with_ra(true);
+        write64(&mut smmu, STRTAB_BASE, written.into());
+
+        let observed = StrtabBase::from(read64(&mut smmu, STRTAB_BASE));
+        assert_eq!(observed.addr(), written.addr());
+        assert!(observed.ra());
+
+        // Program STRTAB_BASE_CFG and confirm both fields read back.
+        let written_cfg = StrtabBaseCfg::new().with_log2size(10).with_fmt(0);
+        write32(&mut smmu, STRTAB_BASE_CFG, written_cfg.into());
+        let observed_cfg = StrtabBaseCfg::from(read32(&mut smmu, STRTAB_BASE_CFG));
+        assert_eq!(observed_cfg.log2size(), 10);
+        assert_eq!(observed_cfg.fmt(), 0);
+    }
+
+    #[test]
+    fn test_irq_ctrl_ack() {
+        let mut smmu = make_test_device();
+
+        let requested = IrqCtrl::new()
+            .with_eventq_irqen(true)
+            .with_gerror_irqen(true);
+        write32(&mut smmu, IRQ_CTRL, requested.into());
+
+        let acked = IrqCtrl::from(read32(&mut smmu, IRQ_CTRLACK));
+        assert!(acked.eventq_irqen());
+        assert!(acked.gerror_irqen());
+    }
+
+    #[test]
+    fn test_gbpa_update_bit() {
+        let mut smmu = make_test_device();
+
+        // Submit a GBPA update (UPDATE=1) that leaves ABORT clear.
+        let request = Gbpa::new().with_update(true).with_abort(false);
+        write32(&mut smmu, GBPA, request.into());
+
+        // On readback UPDATE must have been cleared and ABORT must be 0.
+        let observed = Gbpa::from(read32(&mut smmu, GBPA));
+        assert!(!observed.update());
+        assert!(!observed.abort());
+    }
+
+    #[test]
+    fn test_page1_register_access() {
+        let mut smmu = make_test_device();
+
+        // The guest may write EVTQ_CONS on page 1, and the value reads back.
+        write32_page1(&mut smmu, EVENTQ_CONS_PAGE1, 42);
+        assert_eq!(read32_page1(&mut smmu, EVENTQ_CONS_PAGE1), 42);
+
+        // EVTQ_PROD on page 1 is SMMU-writable only, so a guest write is ignored.
+        write32_page1(&mut smmu, EVENTQ_PROD_PAGE1, 99);
+        assert_eq!(read32_page1(&mut smmu, EVENTQ_PROD_PAGE1), 0);
+    }
+
+    #[test]
+    fn test_readonly_regs_ignore_writes() {
+        let mut smmu = make_test_device();
+
+        let idr0_before = read32(&mut smmu, IDR0);
+        write32(&mut smmu, IDR0, 0xDEAD_BEEF);
+        assert_eq!(read32(&mut smmu, IDR0), idr0_before);
+
+        let aidr_before = read32(&mut smmu, AIDR);
+        write32(&mut smmu, AIDR, 0xCAFE);
+        assert_eq!(read32(&mut smmu, AIDR), aidr_before);
+
+        // Writes to CR0ACK are ignored; it still reads as 0.
+        write32(&mut smmu, CR0ACK, 0xFFFF_FFFF);
+        assert_eq!(read32(&mut smmu, CR0ACK), 0);
+
+        // Writes to IRQ_CTRLACK are ignored; it still reads as 0.
+        write32(&mut smmu, IRQ_CTRLACK, 0xFFFF_FFFF);
+        assert_eq!(read32(&mut smmu, IRQ_CTRLACK), 0);
+    }
+
+    #[test]
+    fn test_cmdq_base_readback() {
+        let mut smmu = make_test_device();
+
+        let written = QueueBase::new()
+            .with_log2size(8)
+            .with_addr_bits(0x8000_0000u64 >> 5);
+        write64(&mut smmu, CMDQ_BASE, written.into());
+        let observed = QueueBase::from(read64(&mut smmu, CMDQ_BASE));
+        assert_eq!(observed.log2size(), 8);
+        assert_eq!(observed.addr(), written.addr());
+    }
+
+    #[test]
+    fn test_evtq_base_readback() {
+        let mut smmu = make_test_device();
+
+        let written = QueueBase::new()
+            .with_log2size(8)
+            .with_addr_bits(0xA000_0000u64 >> 5);
+        write64(&mut smmu, EVENTQ_BASE, written.into());
+        let observed = QueueBase::from(read64(&mut smmu, EVENTQ_BASE));
+        assert_eq!(observed.log2size(), 8);
+        assert_eq!(observed.addr(), written.addr());
+    }
+
+    #[test]
+    fn test_gerror_gerrorn_toggle() {
+        let mut smmu = make_test_device();
+
+        // Both registers start at 0, meaning no error is active.
+        assert_eq!(read32(&mut smmu, GERROR), 0);
+        assert_eq!(read32(&mut smmu, GERRORN), 0);
+
+        // Flip CMDQ_ERR through the shared state, as the emulator would.
+        smmu.shared_state.toggle_cmdq_err();
+        let raised = Gerror::from(read32(&mut smmu, GERROR));
+        assert!(raised.cmdq_err());
+
+        // The guest acknowledges by writing GERRORN to match GERROR.
+        write32(&mut smmu, GERRORN, raised.into());
+        let acked = Gerror::from(read32(&mut smmu, GERRORN));
+        assert!(acked.cmdq_err());
+    }
+
+    #[test]
+    fn test_msi_config_registers() {
+        let mut smmu = make_test_device();
+
+        // GERROR MSI configuration registers live on page 0.
+        write64(&mut smmu, GERROR_IRQ_CFG0, 0xFEDC_BA98_7654_3210);
+        assert_eq!(read64(&mut smmu, GERROR_IRQ_CFG0), 0xFEDC_BA98_7654_3210);
+        write32(&mut smmu, GERROR_IRQ_CFG1, 0xAABB_CCDD);
+        assert_eq!(read32(&mut smmu, GERROR_IRQ_CFG1), 0xAABB_CCDD);
+        write32(&mut smmu, GERROR_IRQ_CFG2, 0x0000_000F);
+        assert_eq!(read32(&mut smmu, GERROR_IRQ_CFG2), 0x0000_000F);
+
+        // EVENTQ MSI configuration registers live on page 0.
+        write64(&mut smmu, EVENTQ_IRQ_CFG0, 0x1111_2222_3333_4444);
+        assert_eq!(read64(&mut smmu, EVENTQ_IRQ_CFG0), 0x1111_2222_3333_4444);
+        write32(&mut smmu, EVENTQ_IRQ_CFG1, 0x5555_6666);
+        assert_eq!(read32(&mut smmu, EVENTQ_IRQ_CFG1), 0x5555_6666);
+        write32(&mut smmu, EVENTQ_IRQ_CFG2, 0x0000_0003);
+        assert_eq!(read32(&mut smmu, EVENTQ_IRQ_CFG2), 0x0000_0003);
+
+        // CMDQ MSI configuration registers are accessed through page 1.
+        write64_page1(&mut smmu, CMDQ_IRQ_CFG0_PAGE1, 0xAAAA_BBBB_CCCC_DDDD);
+        assert_eq!(
+            read64_page1(&mut smmu, CMDQ_IRQ_CFG0_PAGE1),
+            0xAAAA_BBBB_CCCC_DDDD
+        );
+        write32_page1(&mut smmu, CMDQ_IRQ_CFG1_PAGE1, 0x1234_5678);
+        assert_eq!(read32_page1(&mut smmu, CMDQ_IRQ_CFG1_PAGE1), 0x1234_5678);
+        write32_page1(&mut smmu, CMDQ_IRQ_CFG2_PAGE1, 0x0000_0007);
+        assert_eq!(read32_page1(&mut smmu, CMDQ_IRQ_CFG2_PAGE1), 0x0000_0007);
+    }
+
+    #[test]
+    fn test_invalid_access_size() {
+        let mut smmu = make_test_device();
+
+        // A 1-byte read is rejected.
+        let mut one_byte = [0u8; 1];
+        let status = smmu.mmio_read(TEST_MMIO_BASE, &mut one_byte);
+        assert!(matches!(status, IoResult::Err(IoError::InvalidAccessSize)));
+
+        // A 1-byte write is rejected.
+        let status = smmu.mmio_write(TEST_MMIO_BASE, &[0u8]);
+        assert!(matches!(status, IoResult::Err(IoError::InvalidAccessSize)));
+
+        // A 3-byte read is rejected.
+        let mut three_bytes = [0u8; 3];
+        let status = smmu.mmio_read(TEST_MMIO_BASE, &mut three_bytes);
+        assert!(matches!(status, IoResult::Err(IoError::InvalidAccessSize)));
+    }
+
+    #[test]
+    fn test_cr1_cr2_readback() {
+        let mut smmu = make_test_device();
+
+        let cr1_written = Cr1::new()
+            .with_queue_ic(0b01)
+            .with_queue_oc(0b01)
+            .with_queue_sh(0b11)
+            .with_table_ic(0b01)
+            .with_table_oc(0b01)
+            .with_table_sh(0b11);
+        write32(&mut smmu, CR1, cr1_written.into());
+        let cr1_observed = Cr1::from(read32(&mut smmu, CR1));
+        assert_eq!(cr1_observed.queue_ic(), 0b01);
+        assert_eq!(cr1_observed.table_sh(), 0b11);
+
+        let cr2_written = Cr2::new().with_recinvsid(true);
+        write32(&mut smmu, CR2, cr2_written.into());
+        let cr2_observed = Cr2::from(read32(&mut smmu, CR2));
+        assert!(cr2_observed.recinvsid());
+    }
+
+    #[test]
+    fn test_cmdq_prod_readback() {
+        let mut smmu = make_test_device();
+
+        write32(&mut smmu, CMDQ_PROD, 0x0000_0005);
+        assert_eq!(read32(&mut smmu, CMDQ_PROD), 0x0000_0005);
+    }
+
+    // =========================================================================
+    // CMDQ processing tests
+    // =========================================================================
+
+    /// log2 of the test CMDQ size: 2^3 = 8 entries.
+    const TEST_CMDQ_LOG2SIZE: u8 = 3;
+    /// Guest physical address (GPA) of the test CMDQ base.
+    const TEST_CMDQ_GPA: u64 = 0x1_0000;
+    /// Guest physical address (GPA) targeted by CMD_SYNC MSI writes.
+    const TEST_MSI_GPA: u64 = 0x2_0000;
+
+    /// Builds a device over real guest memory with the CMDQ programmed and enabled.
+    fn make_cmdq_test_device() -> SmmuDevice {
+        // The allocation must be large enough for the CMDQ and the MSI target page.
+        let mem = GuestMemory::allocate(0x4_0000);
+        let mut smmu = SmmuDevice::new(TEST_MMIO_BASE, mem, &SmmuConfig::default(), None, None);
+
+        // CMDQ_BASE carries both the queue address and its log2 size.
+        let queue_base = QueueBase::new()
+            .with_log2size(TEST_CMDQ_LOG2SIZE)
+            .with_addr_bits(TEST_CMDQ_GPA >> 5);
+        write64(&mut smmu, CMDQ_BASE, queue_base.into());
+
+        // Set CR0.CMDQEN.
+        let enable = Cr0::new().with_cmdqen(true);
+        write32(&mut smmu, CR0, enable.into());
+
+        smmu
+    }
+
+    /// Stores `entry` into slot `index` of the test CMDQ in guest memory.
+    fn write_cmdq_entry(dev: &SmmuDevice, index: u32, entry: &CmdEntry) {
+        let slot_gpa = TEST_CMDQ_GPA + u64::from(index) * (size_of::<CmdEntry>() as u64);
+        dev.guest_memory
+            .write_plain(slot_gpa, entry)
+            .expect("write cmd entry");
+    }
+
+    #[test]
+    fn test_cmdq_basic_consumption() {
+        let mut smmu = make_cmdq_test_device();
+
+        // Queue three commands: CFGI_STE_RANGE (CFGI_ALL), TLBI_NSNH_ALL, CMD_SYNC(SEV).
+        write_cmdq_entry(
+            &smmu,
+            0,
+            &CmdEntry {
+                qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64,
+                qw1: 31, // Range=31 = ALL
+            },
+        );
+        write_cmdq_entry(
+            &smmu,
+            1,
+            &CmdEntry {
+                qw0: CmdOpcode::TLBI_NSNH_ALL.0 as u64,
+                qw1: 0,
+            },
+        );
+        let sync_sev = CmdSync::new()
+            .with_opcode(CmdOpcode::CMD_SYNC.0)
+            .with_cs(SyncCs::SIG_SEV.0);
+        write_cmdq_entry(
+            &smmu,
+            2,
+            &CmdEntry {
+                qw0: sync_sev.into(),
+                qw1: 0,
+            },
+        );
+
+        // Writing PROD=3 triggers processing of all three entries.
+        write32(&mut smmu, CMDQ_PROD, 3);
+
+        // CONS must have caught up to 3 with no error recorded.
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.rd(), 3);
+        assert_eq!(consumed.err(), 0);
+    }
+
+    #[test]
+    fn test_cmdq_sync_msi_write() {
+        let mut smmu = make_cmdq_test_device();
+
+        let payload: u32 = 0xDEAD_BEEF;
+        let target_gpa: u64 = TEST_MSI_GPA;
+
+        // CMD_SYNC with CS=SIG_IRQ; the MSI data is carried in qw0.
+        let sync_irq = CmdSync::new()
+            .with_opcode(CmdOpcode::CMD_SYNC.0)
+            .with_cs(SyncCs::SIG_IRQ.0)
+            .with_msi_data(payload);
+        // qw1 carries MSI address bits [55:2] (command bits [119:66]), i.e. the
+        // address with its low two bits cleared.
+        let qw1 = target_gpa & !0b11;
+        write_cmdq_entry(
+            &smmu,
+            0,
+            &CmdEntry {
+                qw0: sync_irq.into(),
+                qw1,
+            },
+        );
+
+        // Writing PROD=1 triggers processing of the single entry.
+        write32(&mut smmu, CMDQ_PROD, 1);
+
+        // CONS must have advanced to 1.
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.rd(), 1);
+
+        // The MSI payload must have landed at the target GPA.
+        let landed: u32 = smmu
+            .guest_memory
+            .read_plain(target_gpa)
+            .expect("read MSI data");
+        assert_eq!(landed, payload);
+    }
+
+    #[test]
+    fn test_cmdq_wrap() {
+        let mut smmu = make_cmdq_test_device();
+
+        let capacity = 1u32 << TEST_CMDQ_LOG2SIZE; // 8
+
+        // Occupy every slot with a CFGI_STE_RANGE command (8 in total).
+        for slot in 0..capacity {
+            write_cmdq_entry(
+                &smmu,
+                slot,
+                &CmdEntry {
+                    qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64,
+                    qw1: 31,
+                },
+            );
+        }
+
+        // PROD = 8 encodes index 0 with the wrap bit set.
+        write32(&mut smmu, CMDQ_PROD, capacity);
+
+        // CONS must reach 8 as well, matching PROD including the wrap bit.
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.rd(), capacity);
+        assert_eq!(consumed.err(), 0);
+
+        // Place one more command in slot 0, i.e. after wrapping around.
+        write_cmdq_entry(
+            &smmu,
+            0,
+            &CmdEntry {
+                qw0: CmdOpcode::TLBI_NH_ALL.0 as u64,
+                qw1: 0,
+            },
+        );
+
+        // PROD = 9 encodes index 1 with the wrap bit set.
+        write32(&mut smmu, CMDQ_PROD, capacity + 1);
+
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.rd(), capacity + 1);
+    }
+
+    #[test]
+    fn test_cmdq_unknown_opcode() {
+        let mut smmu = make_cmdq_test_device();
+
+        // Queue a single command whose opcode (0xFF) is not recognized.
+        write_cmdq_entry(&smmu, 0, &CmdEntry { qw0: 0xFF, qw1: 0 });
+
+        write32(&mut smmu, CMDQ_PROD, 1);
+
+        // The CONS error field must report CERROR_ILL.
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.err(), CmdqError::CERROR_ILL.0);
+
+        // GERROR.CMDQ_ERR must have toggled from 0 to 1.
+        let global_err = Gerror::from(read32(&mut smmu, GERROR));
+        assert!(global_err.cmdq_err());
+    }
+
+    #[test]
+    fn test_cmdq_log2size_clamped_to_idr1() {
+        let mem = GuestMemory::allocate(0x4_0000);
+        let mut smmu = SmmuDevice::new(TEST_MMIO_BASE, mem, &SmmuConfig::default(), None, None);
+
+        // IDR1 advertises CMDQS = 8 and EVENTQS = 8; request an oversized 20.
+        let oversized = QueueBase::new()
+            .with_log2size(20)
+            .with_addr_bits(TEST_CMDQ_GPA >> 5);
+        write64(&mut smmu, CMDQ_BASE, oversized.into());
+
+        // The effective log2size must be clamped down to 8.
+        assert_eq!(smmu.cmdq_log2size(), 8);
+
+        // A request within the limit is used as-is.
+        let in_range = QueueBase::new()
+            .with_log2size(5)
+            .with_addr_bits(TEST_CMDQ_GPA >> 5);
+        write64(&mut smmu, CMDQ_BASE, in_range.into());
+        assert_eq!(smmu.cmdq_log2size(), 5);
+    }
+
+    #[test]
+    fn test_cmdq_linux_reset_sequence() {
+        let mut smmu = make_cmdq_test_device();
+
+        // Mirrors the Linux reset flow: CFGI_ALL + CMD_SYNC, then TLBI_NSNH_ALL + CMD_SYNC.
+        // Phase 1: CFGI_ALL (CFGI_STE_RANGE with Range=31) followed by CMD_SYNC(SEV).
+        write_cmdq_entry(
+            &smmu,
+            0,
+            &CmdEntry {
+                qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64,
+                qw1: 31,
+            },
+        );
+        let sync_sev = CmdSync::new()
+            .with_opcode(CmdOpcode::CMD_SYNC.0)
+            .with_cs(SyncCs::SIG_SEV.0);
+        write_cmdq_entry(
+            &smmu,
+            1,
+            &CmdEntry {
+                qw0: sync_sev.into(),
+                qw1: 0,
+            },
+        );
+        write32(&mut smmu, CMDQ_PROD, 2);
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.rd(), 2);
+        assert_eq!(consumed.err(), 0);
+
+        // Phase 2: TLBI_NSNH_ALL followed by CMD_SYNC(SEV).
+        write_cmdq_entry(
+            &smmu,
+            2,
+            &CmdEntry {
+                qw0: CmdOpcode::TLBI_NSNH_ALL.0 as u64,
+                qw1: 0,
+            },
+        );
+        write_cmdq_entry(
+            &smmu,
+            3,
+            &CmdEntry {
+                qw0: sync_sev.into(),
+                qw1: 0,
+            },
+        );
+        write32(&mut smmu, CMDQ_PROD, 4);
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.rd(), 4);
+        assert_eq!(consumed.err(), 0);
+
+        // GERROR.CMDQ_ERR must still be clear after both phases.
+        let global_err = Gerror::from(read32(&mut smmu, GERROR));
+        assert!(!global_err.cmdq_err());
+    }
+
+    #[test]
+    fn test_cmdq_error_stops_processing() {
+        let mut dev = make_cmdq_test_device();
+
+        // Queue an unknown opcode followed by a valid command.
+        write_cmdq_entry(
+            &dev,
+            0,
+            &CmdEntry {
+                qw0: 0xFF, // Unknown
+                qw1: 0,
+            },
+        );
+        write_cmdq_entry(
+            &dev,
+            1,
+            &CmdEntry {
+                qw0: CmdOpcode::TLBI_NH_ALL.0 as u64,
+                qw1: 0,
+            },
+        );
+
+        write32(&mut dev, CMDQ_PROD, 2);
+
+        // CONS should be at 0 — processing stopped at the unknown command.
+        let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS));
+        assert_eq!(cons.rd(), 0);
+        assert_eq!(cons.err(), CmdqError::CERROR_ILL.0);
+
+        // Re-writing PROD must not resume processing while the error is active.
+        write32(&mut dev, CMDQ_PROD, 2);
+        let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS));
+        assert_eq!(cons.rd(), 0);
+
+        // Acknowledge the error by writing GERRORN to match GERROR.
+        let gerror = read32(&mut dev, GERROR);
+        write32(&mut dev, GERRORN, gerror);
+
+        // Recovery is deliberately not exercised here. In practice the guest
+        // would reprogram CMDQ_BASE and re-enable the queue; this test only
+        // verifies that an active error blocks further processing.
+    }
+
+    #[test]
+    fn test_cmdq_disabled() {
+        // Build the device by hand so that CMDQEN is never set.
+        let mem = GuestMemory::allocate(0x4_0000);
+        let mut smmu = SmmuDevice::new(TEST_MMIO_BASE, mem, &SmmuConfig::default(), None, None);
+
+        let queue_base = QueueBase::new()
+            .with_log2size(TEST_CMDQ_LOG2SIZE)
+            .with_addr_bits(TEST_CMDQ_GPA >> 5);
+        write64(&mut smmu, CMDQ_BASE, queue_base.into());
+
+        // Queue one command and bump PROD while CMDQEN is still clear.
+        write_cmdq_entry(
+            &smmu,
+            0,
+            &CmdEntry {
+                qw0: CmdOpcode::TLBI_NH_ALL.0 as u64,
+                qw1: 0,
+            },
+        );
+        write32(&mut smmu, CMDQ_PROD, 1);
+
+        // CONS must remain 0 because the CMDQ is disabled.
+        let consumed = CmdqCons::from(read32(&mut smmu, CMDQ_CONS));
+        assert_eq!(consumed.rd(), 0);
+    }
+
+    // =========================================================================
+    // EVTQ tests
+    // =========================================================================
+
+    /// log2 of the test EVTQ size: 2^3 = 8 entries.
+    const TEST_EVTQ_LOG2SIZE: u8 = 3;
+    /// Guest physical address (GPA) of the test EVTQ base.
+    const TEST_EVTQ_GPA: u64 = 0x3_0000;
+    /// Guest physical address (GPA) programmed as the EVTQ MSI target.
+    const TEST_EVTQ_MSI_GPA: u64 = 0x2_0100;
+
+    /// Builds a device with the EVTQ programmed, its MSI configured, and both enabled.
+    fn make_evtq_test_device() -> SmmuDevice {
+        let mem = GuestMemory::allocate(0x4_0000);
+        let mut smmu = SmmuDevice::new(TEST_MMIO_BASE, mem, &SmmuConfig::default(), None, None);
+
+        // EVTQ_BASE carries both the queue address and its log2 size.
+        let queue_base = QueueBase::new()
+            .with_log2size(TEST_EVTQ_LOG2SIZE)
+            .with_addr_bits(TEST_EVTQ_GPA >> 5);
+        write64(&mut smmu, EVENTQ_BASE, queue_base.into());
+
+        // EVTQ MSI configuration: CFG0 is the target address, CFG1 the payload.
+        write64(&mut smmu, EVENTQ_IRQ_CFG0, TEST_EVTQ_MSI_GPA);
+        write32(&mut smmu, EVENTQ_IRQ_CFG1, 0xBEEF);
+
+        // Set CR0.EVENTQEN and IRQ_CTRL.EVENTQ_IRQEN.
+        let enable = Cr0::new().with_eventqen(true);
+        write32(&mut smmu, CR0, enable.into());
+        let irq_enable = IrqCtrl::new().with_eventq_irqen(true);
+        write32(&mut smmu, IRQ_CTRL, irq_enable.into());
+
+        smmu
+    }
+
+    #[test]
+    fn test_evtq_write_and_read() {
+        let mut smmu = make_evtq_test_device();
+
+        let fault = EvtEntry::translation_fault(42, 0x1000_0000, false);
+        smmu.shared_state().write_event(fault);
+
+        // One event was queued, so EVTQ_PROD must read 1.
+        assert_eq!(read32_page1(&mut smmu, EVENTQ_PROD_PAGE1), 1);
+
+        // Fetch the record from the first EVTQ slot in guest memory.
+        let record: EvtEntry = smmu
+            .guest_memory
+            .read_plain(TEST_EVTQ_GPA)
+            .expect("read event");
+        assert_eq!(
+            record.event_id(),
+            crate::spec::events::EventId::F_TRANSLATION
+        );
+        assert_eq!(record.sid, 42);
+        assert_eq!(record.input_addr, 0x1000_0000);
+        assert!(record.flags.rnw()); // read (rnw=true because write=false)
+    }
+
+    #[test]
+    fn test_evtq_write_advances_prod() {
+        let mut smmu = make_evtq_test_device();
+
+        // Queue two events, checking that PROD moves forward after each one.
+        let first_fault = EvtEntry::translation_fault(1, 0x2000, true);
+        smmu.shared_state().write_event(first_fault);
+        assert_eq!(read32_page1(&mut smmu, EVENTQ_PROD_PAGE1), 1);
+
+        let second_fault = EvtEntry::translation_fault(2, 0x3000, false);
+        smmu.shared_state().write_event(second_fault);
+        assert_eq!(read32_page1(&mut smmu, EVENTQ_PROD_PAGE1), 2);
+
+        // Both records must be present in consecutive EVTQ slots.
+        let first: EvtEntry = smmu.guest_memory.read_plain(TEST_EVTQ_GPA).expect("read");
+        assert_eq!(first.sid, 1);
+        let second: EvtEntry = smmu
+            .guest_memory
+            .read_plain(TEST_EVTQ_GPA + EvtEntry::SIZE as u64)
+            .expect("read");
+        assert_eq!(second.sid, 2);
+    }
+
+    #[test]
+    fn test_evtq_full() {
+        let mut smmu = make_evtq_test_device();
+
+        let capacity = 1u32 << TEST_EVTQ_LOG2SIZE; // 8
+        for sid in 0..capacity {
+            let fault = EvtEntry::translation_fault(sid, 0x1000 * u64::from(sid), false);
+            smmu.shared_state().write_event(fault);
+        }
+
+        // The queue is now full: PROD = 8 (wrap bit set, index 0), CONS = 0.
+        assert_eq!(read32_page1(&mut smmu, EVENTQ_PROD_PAGE1), capacity);
+
+        // One more event must be dropped because the queue is full.
+        let overflow = EvtEntry::translation_fault(99, 0xDEAD, false);
+        smmu.shared_state().write_event(overflow);
+
+        // PROD must be unchanged, showing that the event was dropped.
+        assert_eq!(read32_page1(&mut smmu, EVENTQ_PROD_PAGE1), capacity);
+    }
+
+    #[test]
+    fn test_evtq_cons_frees_space() {
+        let mut dev = make_evtq_test_device();
+
+        let max_entries = 1u32 << TEST_EVTQ_LOG2SIZE; // 8
+        for i in 0..max_entries {
+            let event = EvtEntry::translation_fault(i, 0x1000 * i as u64, false);
+            dev.shared_state().write_event(event);
+        }
+
+        // Queue is full (PROD = 8). Advance CONS to consume 3 entries.
+        write32_page1(&mut dev, EVENTQ_CONS_PAGE1, 3);
+
+        // Should be able to write 3 more events.
+        for i in 0..3u32 {
+            let event = EvtEntry::translation_fault(100 + i, 0xF000, false);
+            dev.shared_state().write_event(event);
+        }
+
+        // PROD should now be at 8 + 3 = 11 (wrap bit set, index 3).
+        assert_eq!(read32_page1(&mut dev, EVENTQ_PROD_PAGE1), max_entries + 3);
+    }
+
+    // =========================================================================
+    // Sub-phase 1J: End-to-End Integration Test
+    // =========================================================================
+
+    /// End-to-end test that exercises the full SMMU stack:
+    /// MMIO register programming → command queue → stream table → context
+    /// descriptor → page table walk → translated DMA read/write → MSI
+    /// translation.
+    ///
+    /// Mimics the Linux SMMUv3 driver initialization sequence:
+    /// 1. Probe: read IDR registers, verify feature bits.
+    /// 2. Reset: disable SMMU, program CR1, stream table, queues, enable.
+    /// 3. Attach: configure STE and CD for a device.
+    /// 4. DMA: read/write through SmmuTranslatingMemory.
+    /// 5. MSI: fire MSI through SmmuSignalMsi with translated address.
+    /// 6. Fault: access unmapped IOVA, verify EVTQ event.
+    #[test]
+    fn test_end_to_end_linux_driver_sequence() {
+        use crate::spec::cd::Cd;
+        use crate::spec::cd::CdDw0;
+        use crate::spec::cd::CdDw1;
+        use crate::spec::cd::Ips;
+        use crate::spec::cd::Tg0;
+        use crate::spec::commands::CmdCfgiCd;
+        use crate::spec::commands::CmdCfgiSte;
+        use crate::spec::commands::CmdCfgiSteRange;
+        use crate::spec::commands::CmdOpcode;
+        use crate::spec::commands::CmdSync;
+        use crate::spec::commands::SyncCs;
+        use crate::spec::events::EventId;
+        use crate::spec::pt::ApBits;
+        use crate::spec::pt::PtDesc;
+        use crate::spec::ste::STE_SIZE;
+        use crate::spec::ste::Ste;
+        use crate::spec::ste::SteConfig;
+        use crate::spec::ste::SteDw0;
+        use crate::spec::ste::SteDw1;
+        use parking_lot::Mutex;
+        use pci_core::bus_range::AssignedBusRange;
+        use pci_core::msi::SignalMsi;
+        use std::sync::Arc;
+
+        // =====================================================================
+        // Memory layout constants
+        // =====================================================================
+
+        const STRTAB_GPA: u64 = 0x10_0000; // Stream table
+        const STRTAB_LOG2SIZE: u8 = 10; // 1024 entries
+        const CMDQ_GPA: u64 = 0x20_0000; // Command queue
+        const CMDQ_LOG2SIZE: u8 = 5; // 32 entries
+        const EVTQ_GPA: u64 = 0x30_0000; // Event queue
+        const EVTQ_LOG2SIZE: u8 = 5; // 32 entries
+        const CD_GPA: u64 = 0x40_0000; // Context descriptor table
+        const PT_L1_GPA: u64 = 0x50_1000; // L1 page table
+        const PT_L2_GPA: u64 = 0x50_2000; // L2 page table
+        const PT_L3_GPA: u64 = 0x50_3000; // L3 page table
+        const DATA_GPA: u64 = 0x6000_0000; // Translated target page
+        const DOORBELL_GPA: u64 = 0x7000_0000; // MSI doorbell physical page
+        const SYNC_MSI_GPA: u64 = 0x80_0000; // CMD_SYNC MSI target
+        const EVTQ_MSI_GPA: u64 = 0x80_0100; // EVTQ MSI target
+
+        // IOVA space layout (guest-programmed)
+        const DMA_IOVA: u64 = 0x0000_0000; // Maps to DATA_GPA
+        const DOORBELL_IOVA: u64 = 0x0800_0000; // Maps to DOORBELL_GPA
+
+        // Device identity
+        const SEGMENT: u16 = 0;
+        const BUS: u8 = 1;
+        const STREAM_ID_BASE: u32 = (SEGMENT as u32) << 16;
+        const STREAM_ID: u32 = STREAM_ID_BASE + ((BUS as u32) << 8);
+
+        // =====================================================================
+        // Mock MSI target
+        // =====================================================================
+
+        struct MockSignalMsi {
+            calls: Mutex<Vec<(Option<u32>, u64, u32)>>,
+        }
+
+        impl MockSignalMsi {
+            fn new() -> Arc<Self> {
+                Arc::new(Self {
+                    calls: Mutex::new(Vec::new()),
+                })
+            }
+
+            fn take_calls(&self) -> Vec<(Option<u32>, u64, u32)> {
+                std::mem::take(&mut *self.calls.lock())
+            }
+        }
+
+        impl SignalMsi for MockSignalMsi {
+            fn signal_msi(&self, devid: Option<u32>, address: u64, data: u32) {
+                self.calls.lock().push((devid, address, data));
+            }
+        }
+
+        // Helper to write a command entry to the CMDQ at a given index.
+        fn write_cmd(gm: &GuestMemory, index: u32, entry: &CmdEntry) {
+            let addr = CMDQ_GPA + (index as u64) * (size_of::<CmdEntry>() as u64);
+            gm.write_plain(addr, entry).expect("write cmd entry");
+        }
+
+        // =====================================================================
+        // Allocate guest memory and create device
+        // =====================================================================
+
+        let gm = GuestMemory::allocate(0x8000_0000); // 2 GiB
+        let mut dev = SmmuDevice::new(
+            TEST_MMIO_BASE,
+            gm.clone(),
+            &SmmuConfig::default(),
+            None,
+            None,
+        );
+
+        // =====================================================================
+        // Step 1: Probe — read IDR registers (arm_smmu_device_hw_probe)
+        // =====================================================================
+
+        let idr0 = Idr0::from(read32(&mut dev, IDR0));
+        assert!(idr0.s1p(), "S1 translation must be supported");
+        assert_eq!(idr0.ttf(), 0b10, "TTF must include AArch64");
+        assert!(!idr0.msi(), "MSI must not be advertised (wired SPIs)");
+        assert_eq!(idr0.ttendian(), 0b10, "Must be little-endian");
+        assert_eq!(idr0.st_level(), 0b00, "Must be linear stream table");
+
+        let idr1 = Idr1::from(read32(&mut dev, IDR1));
+        assert_eq!(idr1.sidsize(), 16);
+        assert!(idr1.cmdqs() >= 5, "CMDQS must support our queue size");
+
+        let idr5 = Idr5::from(read32(&mut dev, IDR5));
+        assert!(idr5.gran4k(), "4K granule must be supported");
+
+        // =====================================================================
+        // Step 2: Reset — arm_smmu_device_reset() sequence
+        // =====================================================================
+
+        // 2a. Disable SMMU.
+        write32(&mut dev, CR0, 0);
+        assert_eq!(
+            read32(&mut dev, CR0ACK),
+            0,
+            "CR0ACK must reflect disabled state"
+        );
+
+        // 2b. Program CR1 (memory attributes for table walks).
+        let cr1 = Cr1::new()
+            .with_table_sh(0b11) // Inner shareable
+            .with_table_oc(0b01) // Write-back
+            .with_table_ic(0b01) // Write-back
+            .with_queue_sh(0b11)
+            .with_queue_oc(0b01)
+            .with_queue_ic(0b01);
+        write32(&mut dev, CR1, cr1.into());
+
+        // 2c. Program stream table base.
+        let strtab_base = StrtabBase::new().with_addr_bits(STRTAB_GPA >> 6);
+        write64(&mut dev, STRTAB_BASE, strtab_base.into());
+        let strtab_cfg = StrtabBaseCfg::new()
+            .with_log2size(STRTAB_LOG2SIZE)
+            .with_fmt(0); // Linear
+        write32(&mut dev, STRTAB_BASE_CFG, strtab_cfg.into());
+
+        // Verify readback.
+        assert_eq!(
+            StrtabBase::from(read64(&mut dev, STRTAB_BASE)).addr(),
+            STRTAB_GPA
+        );
+
+        // 2d. Program CMDQ.
+        let cmdq_base = QueueBase::new()
+            .with_log2size(CMDQ_LOG2SIZE)
+            .with_addr_bits(CMDQ_GPA >> 5);
+        write64(&mut dev, CMDQ_BASE, cmdq_base.into());
+        write32(&mut dev, CMDQ_PROD, 0);
+        // CMDQ_CONS is SMMU-writable only; starts at 0.
+
+        // 2e. Enable CMDQEN.
+        let cr0_cmdqen = Cr0::new().with_cmdqen(true);
+        write32(&mut dev, CR0, cr0_cmdqen.into());
+        assert_eq!(
+            Cr0::from(read32(&mut dev, CR0ACK)).cmdqen(),
+            true,
+            "CMDQEN must be acknowledged"
+        );
+
+        // 2f. Issue CFGI_ALL + CMD_SYNC (invalidate all cached STEs).
+        let mut cmd_idx: u32 = 0;
+
+        let cfgi_all = CmdEntry {
+            qw0: CmdCfgiSteRange::new()
+                .with_opcode(CmdOpcode::CFGI_STE_RANGE.0)
+                .into(),
+            qw1: CmdCfgiSteRange::RANGE_ALL as u64,
+        };
+        write_cmd(&gm, cmd_idx, &cfgi_all);
+        cmd_idx += 1;
+
+        let sync0 = CmdEntry {
+            qw0: CmdSync::new()
+                .with_opcode(CmdOpcode::CMD_SYNC.0)
+                .with_cs(SyncCs::SIG_IRQ.0)
+                .with_msi_data(0xAAAA)
+                .into(),
+            qw1: (SYNC_MSI_GPA >> 2) << 2,
+        };
+        write_cmd(&gm, cmd_idx, &sync0);
+        cmd_idx += 1;
+
+        write32(&mut dev, CMDQ_PROD, cmd_idx);
+
+        // Verify CONS advanced.
+        let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS));
+        assert_eq!(cons.rd(), cmd_idx, "CMDQ_CONS must advance to PROD");
+
+        // Verify CMD_SYNC MSI written.
+        let sync_val: u32 = gm.read_plain(SYNC_MSI_GPA).expect("read sync MSI");
+        assert_eq!(sync_val, 0xAAAA, "CMD_SYNC MSI data must match");
+
+        // 2g. Issue TLBI_NSNH_ALL + CMD_SYNC.
+        let tlbi_all = CmdEntry {
+            qw0: CmdOpcode::TLBI_NSNH_ALL.0 as u64,
+            qw1: 0,
+        };
+        write_cmd(&gm, cmd_idx, &tlbi_all);
+        cmd_idx += 1;
+
+        // Reset sync target.
+        gm.write_at(SYNC_MSI_GPA, &0u32.to_le_bytes()).unwrap();
+
+        let sync1 = CmdEntry {
+            qw0: CmdSync::new()
+                .with_opcode(CmdOpcode::CMD_SYNC.0)
+                .with_cs(SyncCs::SIG_IRQ.0)
+                .with_msi_data(0xBBBB)
+                .into(),
+            qw1: (SYNC_MSI_GPA >> 2) << 2,
+        };
+        write_cmd(&gm, cmd_idx, &sync1);
+        cmd_idx += 1;
+
+        write32(&mut dev, CMDQ_PROD, cmd_idx);
+
+        let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS));
+        assert_eq!(cons.rd(), cmd_idx);
+        let sync_val: u32 = gm.read_plain(SYNC_MSI_GPA).expect("read sync MSI");
+        assert_eq!(sync_val, 0xBBBB);
+
+        // 2h. Program EVTQ.
+        let evtq_base = QueueBase::new()
+            .with_log2size(EVTQ_LOG2SIZE)
+            .with_addr_bits(EVTQ_GPA >> 5);
+        write64(&mut dev, EVENTQ_BASE, evtq_base.into());
+
+        // Program EVTQ MSI config.
+        write64(&mut dev, EVENTQ_IRQ_CFG0, EVTQ_MSI_GPA);
+        write32(&mut dev, EVENTQ_IRQ_CFG1, 0xDEAD);
+
+        // 2i. Enable EVTQEN.
+        let cr0_evtqen = Cr0::new().with_cmdqen(true).with_eventqen(true);
+        write32(&mut dev, CR0, cr0_evtqen.into());
+        assert!(Cr0::from(read32(&mut dev, CR0ACK)).eventqen());
+
+        // 2j. Enable EVENTQ IRQ.
+        let irq_ctrl = IrqCtrl::new().with_eventq_irqen(true);
+        write32(&mut dev, IRQ_CTRL, irq_ctrl.into());
+        assert!(IrqCtrl::from(read32(&mut dev, IRQ_CTRLACK)).eventq_irqen());
+
+        // 2k. Enable SMMUEN.
+        let cr0_full = Cr0::new()
+            .with_cmdqen(true)
+            .with_eventqen(true)
+            .with_smmuen(true);
+        write32(&mut dev, CR0, cr0_full.into());
+        let cr0ack = Cr0::from(read32(&mut dev, CR0ACK));
+        assert!(cr0ack.smmuen(), "SMMUEN must be acknowledged");
+        assert!(cr0ack.cmdqen());
+        assert!(cr0ack.eventqen());
+
+        // =====================================================================
+        // Step 3: Attach device — configure STE and CD for stream ID
+        // =====================================================================
+
+        // 3a. Write STE: S1_TRANS mode, point to CD table at CD_GPA.
+        let ste = Ste {
+            qw0: SteDw0::new()
+                .with_v(true)
+                .with_config(SteConfig::S1_TRANS.0)
+                .with_s1_context_ptr(CD_GPA >> 6)
+                .with_s1_cd_max(0), // Single CD (SSID=0 only)
+            qw1: SteDw1::new(),
+            _qw2_7: [0u64; 6],
+        };
+        let ste_addr = STRTAB_GPA + (STREAM_ID as u64) * (STE_SIZE as u64);
+        gm.write_plain(ste_addr, &ste).expect("write STE");
+
+        // 3b. Write CD: TTB0 = PT_L1_GPA, T0SZ=32 (32-bit VA), 4K granule, 40-bit OAS.
+        let cd = Cd {
+            qw0: CdDw0::new()
+                .with_v(true)
+                .with_t0sz(32)
+                .with_tg0(Tg0::GRAN_4K.0)
+                .with_ips(Ips::IPS_40.0)
+                .with_aa64(true)
+                .with_asid(1),
+            qw1: CdDw1::new().with_ttb0(PT_L1_GPA >> 4),
+            _qw2: 0,
+            mair0: 0xFF440C0400,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        };
+        let cd_addr = CD_GPA; // SSID=0
+        gm.write_plain(cd_addr, &cd).expect("write CD");
+
+        // 3c. Build page table hierarchy for DMA region:
+        //     IOVA 0x0000_0000..0x0000_0FFF → DATA_GPA
+        //     T0SZ=32, 4K granule → 3-level walk (L1, L2, L3).
+        //
+        // L1[0] → L2 table
+        let l1_desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_addr_bits(PT_L2_GPA >> 12);
+        gm.write_plain::<u64>(PT_L1_GPA, &l1_desc.into())
+            .expect("write L1");
+
+        // L2[0] → L3 table
+        let l2_desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_addr_bits(PT_L3_GPA >> 12);
+        gm.write_plain::<u64>(PT_L2_GPA, &l2_desc.into())
+            .expect("write L2");
+
+        // L3[0] → page at DATA_GPA (RW, AF=1)
+        let l3_desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true) // L3: type=1 means page
+            .with_af(true)
+            .with_ap(ApBits::RW_EL1.0)
+            .with_addr_bits(DATA_GPA >> 12);
+        gm.write_plain::<u64>(PT_L3_GPA, &l3_desc.into())
+            .expect("write L3[0]");
+
+        // 3d. Build page table for doorbell region (for MSI translation):
+        //     IOVA 0x0800_0000 → DOORBELL_GPA
+        //     L1 index = 0x0800_0000 >> 30 = 0 (same L1 entry)
+        //     L2 index = (0x0800_0000 >> 21) & 0x1FF = 64
+        //     L3 index = (0x0800_0000 >> 12) & 0x1FF = 0
+        //
+        // We need a separate L2→L3 chain for L2[64].
+        const DOORBELL_PT_L3_GPA: u64 = 0x50_4000;
+
+        // L2[64] → doorbell L3 table
+        let l2_doorbell_desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_addr_bits(DOORBELL_PT_L3_GPA >> 12);
+        let l2_doorbell_offset = 64 * 8; // L2 index 64, 8 bytes per entry
+        gm.write_plain::<u64>(PT_L2_GPA + l2_doorbell_offset, &l2_doorbell_desc.into())
+            .expect("write L2[64]");
+
+        // Doorbell L3[0] → page at DOORBELL_GPA
+        let l3_doorbell_desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_af(true)
+            .with_ap(ApBits::RW_EL1.0)
+            .with_addr_bits(DOORBELL_GPA >> 12);
+        gm.write_plain::<u64>(DOORBELL_PT_L3_GPA, &l3_doorbell_desc.into())
+            .expect("write doorbell L3[0]");
+
+        // 3e. Issue CFGI_STE + CFGI_CD + CMD_SYNC via CMDQ.
+        let cfgi_ste = CmdEntry {
+            qw0: CmdCfgiSte::new()
+                .with_opcode(CmdOpcode::CFGI_STE.0)
+                .with_sid(STREAM_ID)
+                .into(),
+            qw1: 0,
+        };
+        write_cmd(&gm, cmd_idx, &cfgi_ste);
+        cmd_idx += 1;
+
+        let cfgi_cd = CmdEntry {
+            qw0: CmdCfgiCd::new()
+                .with_opcode(CmdOpcode::CFGI_CD.0)
+                .with_sid(STREAM_ID)
+                .with_ssid(0)
+                .into(),
+            qw1: 0,
+        };
+        write_cmd(&gm, cmd_idx, &cfgi_cd);
+        cmd_idx += 1;
+
+        // Reset sync target.
+        gm.write_at(SYNC_MSI_GPA, &0u32.to_le_bytes()).unwrap();
+
+        let sync2 = CmdEntry {
+            qw0: CmdSync::new()
+                .with_opcode(CmdOpcode::CMD_SYNC.0)
+                .with_cs(SyncCs::SIG_IRQ.0)
+                .with_msi_data(0xCCCC)
+                .into(),
+            qw1: (SYNC_MSI_GPA >> 2) << 2,
+        };
+        write_cmd(&gm, cmd_idx, &sync2);
+        cmd_idx += 1;
+
+        write32(&mut dev, CMDQ_PROD, cmd_idx);
+
+        let cons = CmdqCons::from(read32(&mut dev, CMDQ_CONS));
+        assert_eq!(cons.rd(), cmd_idx, "All commands must be consumed");
+        let sync_val: u32 = gm.read_plain(SYNC_MSI_GPA).expect("read sync MSI");
+        assert_eq!(sync_val, 0xCCCC, "CFGI+SYNC completion must be signaled");
+
+        // =====================================================================
+        // Step 4: DMA — read/write through SmmuTranslatingMemory
+        // =====================================================================
+
+        // Create per-device wrappers.
+        let shared_state = dev.shared_state().clone();
+        let bus_range = AssignedBusRange::new();
+        bus_range.set_bus_range(BUS, BUS);
+        let mock_msi = MockSignalMsi::new();
+
+        let (translating_gm, smmu_msi) =
+            shared_state.create_device_context(bus_range, STREAM_ID_BASE, &gm, mock_msi.clone());
+
+        // 4a. Write test data at DATA_GPA via raw guest memory.
+        let test_data = b"Hello from SMMU end-to-end test!";
+        gm.write_at(DATA_GPA, test_data).unwrap();
+
+        // 4b. Read via IOVA → should get data from DATA_GPA.
+        let mut buf = vec![0u8; test_data.len()];
+        translating_gm
+            .read_at(DMA_IOVA, &mut buf)
+            .expect("DMA read through SMMU must succeed");
+        assert_eq!(&buf, test_data, "Translated read must return correct data");
+
+        // 4c. Write via IOVA with an offset.
+        let write_data = b"DMA write OK";
+        let write_offset = 0x100u64;
+        translating_gm
+            .write_at(DMA_IOVA + write_offset, write_data)
+            .expect("DMA write through SMMU must succeed");
+
+        // Verify at raw GPA.
+        let mut verify_buf = vec![0u8; write_data.len()];
+        gm.read_at(DATA_GPA + write_offset, &mut verify_buf)
+            .unwrap();
+        assert_eq!(
+            &verify_buf, write_data,
+            "Translated write must land at correct GPA"
+        );
+
+        // =====================================================================
+        // Step 5: MSI — translate MSI address through SMMU
+        // =====================================================================
+
+        // Fire MSI with address = DOORBELL_IOVA + 0x40 (intra-page offset).
+        // The SMMU should translate DOORBELL_IOVA → DOORBELL_GPA.
+        // devid is a RID: (bus << 8 | devfn). Must be within the device's
+        // assigned bus range for the SMMU to accept it.
+        let device_rid = (BUS as u32) << 8; // devfn = 0
+        smmu_msi.signal_msi(Some(device_rid), DOORBELL_IOVA + 0x40, 0x1234);
+
+        let msi_calls = mock_msi.take_calls();
+        assert_eq!(msi_calls.len(), 1, "Exactly one MSI must be forwarded");
+        let (devid, addr, data) = &msi_calls[0];
+        assert_eq!(*devid, Some(device_rid), "devid must be passed through");
+        assert_eq!(
+            *addr,
+            DOORBELL_GPA + 0x40,
+            "MSI address must be translated with offset"
+        );
+        assert_eq!(*data, 0x1234, "MSI data must be passed through");
+
+        // =====================================================================
+        // Step 6: Fault — access unmapped IOVA, verify EVTQ event
+        // =====================================================================
+
+        // IOVA 0x1000_0000 has no page table mapping → translation fault.
+        let unmapped_iova: u64 = 0x1000_0000;
+        let mut fault_buf = [0u8; 4];
+        let result = translating_gm.read_at(unmapped_iova, &mut fault_buf);
+        assert!(result.is_err(), "Read from unmapped IOVA must return error");
+
+        // The fault event is queued in shared state. Trigger a drain by
+        // writing CMDQ_PROD (which drains pending events).
+        write32(&mut dev, CMDQ_PROD, cmd_idx); // No new commands, just drain.
+
+        // Verify EVTQ_PROD advanced (an event was written).
+        let evtq_prod = read32_page1(&mut dev, EVENTQ_PROD_PAGE1);
+        assert!(evtq_prod > 0, "EVTQ must have at least one event");
+
+        // Read the event from guest memory.
+        let event: EvtEntry = gm.read_plain(EVTQ_GPA).expect("read fault event");
+        assert_eq!(
+            event.event_id(),
+            EventId::F_TRANSLATION,
+            "Fault must be a translation fault"
+        );
+        assert_eq!(event.sid, STREAM_ID, "Fault SID must match device");
+        assert_eq!(
+            event.input_addr, unmapped_iova,
+            "Fault IOVA must match access"
+        );
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/lib.rs b/vm/devices/iommu/smmu/src/lib.rs
new file mode 100644
index 0000000000..56ec3af54c
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/lib.rs
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 emulator for OpenVMM.
+//!
+//! This crate implements an Arm SMMUv3 (System Memory Management Unit)
+//! emulator, providing IOVA→GPA translation for devices behind the SMMU.
+
+pub mod spec;
+
+mod emulator;
+mod shared;
+mod translate;
+
+pub use emulator::SmmuConfig;
+pub use emulator::SmmuDevice;
+pub use shared::SmmuSharedState;
+pub use shared::SmmuSignalMsi;
+pub use shared::SmmuTranslatingMemory;
diff --git a/vm/devices/iommu/smmu/src/shared.rs b/vm/devices/iommu/smmu/src/shared.rs
new file mode 100644
index 0000000000..42e8a0bf14
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/shared.rs
@@ -0,0 +1,1371 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Shared SMMU state and per-device translation wrappers.
+//!
+//! [`SmmuSharedState`] holds the SMMU state that per-device wrappers need:
+//! stream table base, CR0 enable state, guest memory for walking page tables,
+//! and the event queue / GERROR state used to report faults.
+//!
+//! [`SmmuTranslatingMemory`] implements [`guestmem::GuestMemoryAccess`], translating
+//! IOVAs to GPAs via the SMMU page tables before accessing the underlying
+//! guest memory.
+//!
+//! [`SmmuSignalMsi`] implements [`SignalMsi`], translating the MSI address
+//! (which may be an IOVA) to a GPA before forwarding to the inner MSI
+//! target.
+//!
+//! [`SmmuIrqFd`] implements [`IrqFd`](vmcore::irqfd::IrqFd), producing
+//! [`SmmuIrqFdRoute`] instances that translate the MSI address on
+//! [`enable`](vmcore::irqfd::IrqFdRoute::enable) before forwarding to the
+//! inner irqfd route.
+
+use crate::spec::events::EvtEntry;
+use crate::spec::registers;
+use crate::translate;
+use guestmem::GuestMemory;
+use guestmem::GuestMemoryBackingError;
+use pal_event::Event;
+use parking_lot::Mutex;
+use parking_lot::RwLock;
+use pci_core::bus_range::AssignedBusRange;
+use pci_core::msi::SignalMsi;
+use std::fmt;
+use std::ptr::NonNull;
+use std::sync::Arc;
+use vmcore::irqfd::IrqFd;
+use vmcore::irqfd::IrqFdRoute;
+use vmcore::line_interrupt::LineInterrupt;
+use zerocopy::IntoBytes;
+
+/// Composes an SMMU-local stream ID from a bus range, a base offset,
+/// and an optional per-device BDF (`devid`, with the bus number in bits 8..16).
+///
+/// The stream ID is `stream_id_base + (bdf & 0xFFFF)`. When `devid`
+/// is `None`, the default BDF `(secondary_bus, dev 0, fn 0)` is used.
+///
+/// Returns `None` if the secondary bus has not been assigned yet
+/// (still 0), or — with a rate-limited warning — if the BDF's bus
+/// number falls outside the `[secondary, subordinate]` range.
+fn compose_stream_id(
+    bus_range: &AssignedBusRange,
+    stream_id_base: u32,
+    devid: Option<u32>,
+) -> Option<u32> {
+    let (secondary, subordinate) = bus_range.bus_range();
+    if secondary == 0 {
+        return None;
+    }
+    let bdf = devid.unwrap_or((secondary as u32) << 8);
+    let bus = (bdf >> 8) as u8; // bits 8..16 of the BDF; higher bits are truncated
+    if bus < secondary || bus > subordinate {
+        tracelimit::warn_ratelimited!(bus, secondary, subordinate, "BDF out of port bus range");
+        return None;
+    }
+    Some(stream_id_base + (bdf & 0xFFFF))
+}
+
+/// Error for a failed SMMU DMA translation: the faulting IOVA plus a static reason.
+#[derive(Debug)]
+struct SmmuTranslationError {
+    iova: u64, // IOVA whose translation failed (printed in hex by `Display`)
+    msg: &'static str, // short reason, embedded verbatim in the `Display` output
+}
+
+impl fmt::Display for SmmuTranslationError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "SMMU translation failed: {} at IOVA {:#x}",
+            self.msg, self.iova
+        )
+    }
+}
+
+impl std::error::Error for SmmuTranslationError {}
+
+/// Result of an SMMU translation attempt.
+#[derive(Debug)]
+enum TranslateResult {
+    /// No translation (IOVA = GPA): SMMU disabled, STE bypass, or bus not yet assigned.
+    Bypass,
+    /// Stage 1 walk succeeded; carries the translated GPA.
+    Translated(u64),
+    /// Abort — STE says to abort this stream's DMA; carries the associated event record.
+    Abort(EvtEntry),
+    /// STE/CD lookup or page-table walk failed; carries the event record to queue.
+    Fault(EvtEntry),
+}
+
+/// Shared SMMU state accessed by per-device translation wrappers.
+///
+/// The SMMU device updates this state on register writes; per-device wrappers
+/// read it during translation. The `RwLock` allows concurrent translations
+/// (read path) while register writes (write path) are exclusive.
+///
+/// Queue and error state is behind a separate `Mutex` so that per-device
+/// wrappers can write fault events and signal overflow without going through
+/// the emulator.
+pub struct SmmuSharedState {
+    /// Translation configuration — RwLock for concurrent DMA reads.
+    inner: RwLock<SharedStateInner>,
+    /// Guest memory for reading stream table/page table entries and writing EVTQ records.
+    guest_memory: GuestMemory,
+    /// Event queue and global error state — single mutex covers both
+    /// because the EVTQ overflow path needs to update GERROR atomically.
+    queue_state: Mutex<QueueErrorState>,
+    /// Wired SPI interrupt line for event queue signaling (`None` if not wired).
+    evtq_irq: Option<LineInterrupt>,
+    /// Wired SPI interrupt line for global error signaling (`None` if not wired).
+    gerror_irq: Option<LineInterrupt>,
+}
+
+struct SharedStateInner {
+    /// Whether the SMMU is enabled (CR0.SMMUEN); when false, translation is bypassed.
+    enabled: bool,
+    /// Stream table base address, as passed to `translate::lookup_ste`.
+    strtab_base: u64,
+    /// log2 of the number of stream table entries.
+    strtab_log2size: u8,
+}
+
+/// Event queue and global error state.
+///
+/// A single mutex serializes event writes from concurrent DMA fault
+/// paths, GERROR updates from both the emulator and DMA overflow,
+/// and interrupt line level changes.
+struct QueueErrorState {
+    // -- Event queue --
+    /// EVTQ base GPA (parsed from EVTQ_BASE register).
+    evtq_base_addr: u64,
+    /// EVTQ log2 size (clamped to IDR1.EVENTQS).
+    evtq_log2size: u8,
+    /// Whether the event queue is enabled (CR0.EVENTQEN).
+    evtq_enabled: bool,
+    /// Whether the EVTQ interrupt is enabled (IRQ_CTRL.EVENTQ_IRQEN).
+    evtq_irqen: bool,
+    /// Producer index incl. wrap bit (advanced by the SMMU when writing events).
+    evtq_prod: u32,
+    /// Consumer index (set from guest MMIO writes; masked on use in `write_event`).
+    evtq_cons: u32,
+
+    // -- Global error registers (toggle protocol) --
+    /// GERROR register — individual error bits toggled by the SMMU.
+    gerror: registers::Gerror,
+    /// GERRORN register — written by the guest to acknowledge errors.
+    gerrorn: registers::Gerror,
+    /// Whether the GERROR interrupt is enabled (IRQ_CTRL.GERROR_IRQEN).
+    gerror_irqen: bool,
+}
+
+impl SmmuSharedState {
+    /// Creates a new shared state with the SMMU disabled.
+    pub fn new(
+        guest_memory: GuestMemory,
+        evtq_irq: Option<LineInterrupt>,
+        gerror_irq: Option<LineInterrupt>,
+    ) -> Arc<Self> {
+        Arc::new(Self {
+            inner: RwLock::new(SharedStateInner {
+                enabled: false,
+                strtab_base: 0,
+                strtab_log2size: 0,
+            }),
+            guest_memory,
+            queue_state: Mutex::new(QueueErrorState {
+                evtq_base_addr: 0,
+                evtq_log2size: 0,
+                evtq_enabled: false,
+                evtq_irqen: false,
+                evtq_prod: 0,
+                evtq_cons: 0,
+                gerror: registers::Gerror::new(),
+                gerrorn: registers::Gerror::new(),
+                gerror_irqen: false,
+            }),
+            evtq_irq,
+            gerror_irq,
+        })
+    }
+
+    /// Updates the SMMU enable state (called by SmmuDevice on CR0 writes).
+    pub fn set_enabled(&self, enabled: bool) {
+        self.inner.write().enabled = enabled;
+    }
+
+    /// Updates the stream table configuration (called by SmmuDevice on
+    /// STRTAB_BASE / STRTAB_BASE_CFG writes).
+    pub fn set_strtab(&self, base: u64, log2size: u8) {
+        let mut inner = self.inner.write();
+        inner.strtab_base = base;
+        inner.strtab_log2size = log2size;
+    }
+
+    /// Updates the event queue configuration (called by SmmuDevice on
+    /// EVTQ_BASE writes).
+    pub fn set_evtq_config(&self, base_addr: u64, log2size: u8) {
+        let mut qs = self.queue_state.lock();
+        qs.evtq_base_addr = base_addr;
+        qs.evtq_log2size = log2size;
+    }
+
+    /// Updates the event queue enabled state (called on CR0 writes).
+    pub fn set_evtq_enabled(&self, enabled: bool) {
+        self.queue_state.lock().evtq_enabled = enabled;
+    }
+
+    /// Updates both interrupt enable flags from IRQ_CTRL (called on
+    /// IRQ_CTRL writes). Also updates the GERROR interrupt line level.
+    pub fn set_irq_ctrl(&self, evtq_irqen: bool, gerror_irqen: bool) {
+        let mut qs = self.queue_state.lock();
+        qs.evtq_irqen = evtq_irqen;
+        qs.gerror_irqen = gerror_irqen;
+        self.update_gerror_irq(&qs);
+    }
+
+    /// Reads the current GERROR register value.
+    pub fn read_gerror(&self) -> registers::Gerror {
+        self.queue_state.lock().gerror
+    }
+
+    /// Reads the current GERRORN register value.
+    pub fn read_gerrorn(&self) -> registers::Gerror {
+        self.queue_state.lock().gerrorn
+    }
+
+    /// Returns true if GERROR.CMDQ_ERR != GERRORN.CMDQ_ERR (error active).
+    pub fn cmdq_err_active(&self) -> bool {
+        let qs = self.queue_state.lock();
+        qs.gerror.cmdq_err() != qs.gerrorn.cmdq_err()
+    }
+
+    /// Writes GERRORN (guest acknowledging errors) and updates the
+    /// interrupt line level.
+    pub fn write_gerrorn(&self, value: u32) {
+        let mut qs = self.queue_state.lock();
+        qs.gerrorn = registers::Gerror::from(value);
+        self.update_gerror_irq(&qs);
+    }
+
+    /// Toggles GERROR.CMDQ_ERR to signal a command queue error.
+    ///
+    /// Updates the interrupt line level under the lock.
+    pub fn toggle_cmdq_err(&self) {
+        let mut qs = self.queue_state.lock();
+        let new_val = !qs.gerror.cmdq_err();
+        qs.gerror.set_cmdq_err(new_val);
+        self.update_gerror_irq(&qs);
+    }
+
+    /// Signals an EVTQ overflow by making GERROR.EVTQ_ABT_ERR active.
+    ///
+    /// Per spec, sets the bit to the inverse of GERRORN.EVTQ_ABT_ERR.
+    /// If the error is already active this is a no-op (the bit value
+    /// doesn't change). Called from `write_event` under the same lock.
+    fn signal_evtq_overflow(&self, qs: &mut QueueErrorState) {
+        let new_val = !qs.gerrorn.eventq_abt_err();
+        qs.gerror.set_eventq_abt_err(new_val);
+        self.update_gerror_irq(qs);
+    }
+
+    /// Updates the GERROR wired interrupt line level based on current state.
+    ///
+    /// Must be called with the queue_state lock held. The line is held
+    /// high while any error is active (GERROR != GERRORN) and deasserted
+    /// when all errors are acknowledged.
+    fn update_gerror_irq(&self, qs: &QueueErrorState) {
+        if let Some(irq) = &self.gerror_irq {
+            let active = qs.gerror_irqen && u32::from(qs.gerror) != u32::from(qs.gerrorn);
+            irq.set_level(active);
+        }
+    }
+
+    /// Updates the event queue consumer index (called when the guest
+    /// writes EVENTQ_CONS on page 1).
+    ///
+    /// Deasserts the EVTQ wired interrupt if the queue is now empty.
+    pub fn set_evtq_cons(&self, cons: u32) {
+        let mut qs = self.queue_state.lock();
+        qs.evtq_cons = cons;
+        // Deassert EVTQ IRQ when the guest has drained all events.
+        if qs.evtq_irqen && qs.evtq_prod == qs.evtq_cons {
+            if let Some(irq) = &self.evtq_irq {
+                irq.set_level(false);
+            }
+        }
+    }
+
+    /// Returns the current event queue producer index (for guest reads
+    /// of EVENTQ_PROD on page 1).
+    pub fn evtq_prod(&self) -> u32 {
+        self.queue_state.lock().evtq_prod
+    }
+
+    /// Returns the current event queue consumer index (for guest reads
+    /// of EVENTQ_CONS on page 1).
+    pub fn evtq_cons(&self) -> u32 {
+        self.queue_state.lock().evtq_cons
+    }
+
+    /// Resets event queue and GERROR state (called on device reset).
+    pub fn reset_queue_state(&self) {
+        let mut qs = self.queue_state.lock();
+        qs.evtq_base_addr = 0;
+        qs.evtq_log2size = 0;
+        qs.evtq_enabled = false;
+        qs.evtq_irqen = false;
+        qs.evtq_prod = 0;
+        qs.evtq_cons = 0;
+        qs.gerror = registers::Gerror::new();
+        qs.gerrorn = registers::Gerror::new();
+        qs.gerror_irqen = false;
+        self.update_gerror_irq(&qs);
+    }
+
+    /// Translate an IOVA to a GPA for the given stream ID.
+    ///
+    /// Callers that need to hold the lock across translation and a subsequent
+    /// memory access should use [`translate_with`] instead.
+    fn translate(&self, sid: u32, iova: u64, write: bool) -> TranslateResult {
+        let inner = self.inner.read();
+        self.translate_locked(&inner, sid, iova, write)
+    }
+
+    /// Translate an IOVA to a GPA while holding the read lock.
+    ///
+    /// The caller holds `inner` across both translation and the subsequent
+    /// memory access, preventing SMMU config changes (disable, stream table
+    /// base update) from creating a TOCTOU between translation and access.
+    fn translate_locked(
+        &self,
+        inner: &SharedStateInner,
+        sid: u32,
+        iova: u64,
+        write: bool,
+    ) -> TranslateResult {
+        if !inner.enabled {
+            return TranslateResult::Bypass;
+        }
+
+        // Look up the STE.
+        let ste = match translate::lookup_ste(
+            &self.guest_memory,
+            inner.strtab_base,
+            inner.strtab_log2size,
+            sid,
+        ) {
+            Ok(ste) => ste,
+            Err(fault) => return TranslateResult::Fault(fault.event),
+        };
+
+        // Dispatch on STE config.
+        let action = match translate::ste_config_action(&ste) {
+            Ok(action) => action,
+            Err(_) => return TranslateResult::Fault(EvtEntry::bad_ste(sid)),
+        };
+
+        match action {
+            translate::SteAction::Abort => TranslateResult::Abort(EvtEntry::bad_ste(sid)),
+            translate::SteAction::Bypass => TranslateResult::Bypass,
+            translate::SteAction::S1Translate => {
+                // Look up the CD.
+                let cd = match translate::lookup_cd(&self.guest_memory, &ste, sid, 0) {
+                    Ok(cd) => cd,
+                    Err(fault) => return TranslateResult::Fault(fault.event),
+                };
+
+                // Extract translation context.
+                let ctx = match translate::translation_context(&cd, sid) {
+                    Ok(ctx) => ctx,
+                    Err(fault) => return TranslateResult::Fault(fault.event),
+                };
+
+                // Walk the page table.
+                match translate::walk_s1(&self.guest_memory, &ctx, iova, write, sid) {
+                    Ok(tr) => TranslateResult::Translated(tr.gpa),
+                    Err(fault) => TranslateResult::Fault(fault.event),
+                }
+            }
+        }
+    }
+
+    /// Write an event record directly to the guest's event queue.
+    ///
+    /// Called from per-device DMA fault paths and from the emulator's
+    /// command processing. If the queue is full, drops the event and
+    /// logs a warning. If an event is successfully written, pulses
+    /// the EVTQ wired SPI interrupt (if enabled).
+    pub fn write_event(&self, event: EvtEntry) {
+        let mut qs = self.queue_state.lock();
+        if !qs.evtq_enabled {
+            return;
+        }
+
+        let max_entries = 1u32 << qs.evtq_log2size;
+        let index_mask = (max_entries << 1) - 1;
+        let prod = qs.evtq_prod & index_mask;
+        let cons = qs.evtq_cons & index_mask;
+
+        // Check if the queue is full. Full when the index bits match but
+        // the wrap bit differs: (prod ^ cons) == max_entries.
+        if (prod ^ cons) == max_entries {
+            // Signal EVTQ overflow via GERROR.EVTQ_ABT_ERR — updates
+            // the GERROR register and interrupt line under the same lock.
+            self.signal_evtq_overflow(&mut qs);
+            tracelimit::warn_ratelimited!("smmu: EVTQ full, dropping event");
+            return;
+        }
+
+        // Write the 32-byte event record to guest memory.
+        let index = prod & (max_entries - 1);
+        let entry_addr = qs.evtq_base_addr + (index as u64) * (EvtEntry::SIZE as u64);
+
+        if let Err(e) = self.guest_memory.write_at(entry_addr, event.as_bytes()) {
+            tracelimit::warn_ratelimited!(
+                error = &e as &dyn std::error::Error,
+                entry_addr,
+                "smmu: failed to write EVTQ entry to guest memory"
+            );
+            return;
+        }
+
+        // Advance EVTQ_PROD.
+        qs.evtq_prod = (prod + 1) & index_mask;
+
+        // Assert EVTQ wired interrupt — held high while queue is non-empty.
+        // Deasserted when the guest drains events via CONS writes.
+        if qs.evtq_irqen {
+            if let Some(irq) = &self.evtq_irq {
+                irq.set_level(true);
+            }
+        }
+    }
+
+    /// Builds the translation wrappers for a PCI device behind this SMMU.
+    ///
+    /// `stream_id_base` is the offset into this SMMU's stream table for the
+    /// root complex this device belongs to (from the IORT `ID_MAPPING`).
+    /// The stream ID for translation is `stream_id_base + bdf`.
+    ///
+    /// Returns a `GuestMemory` that translates IOVAs via the SMMU before
+    /// accessing `inner_gm`, and a `SignalMsi` implementation that translates
+    /// MSI addresses before forwarding them to `inner_msi`.
+    pub fn create_device_context(
+        self: &Arc<Self>,
+        bus_range: AssignedBusRange,
+        stream_id_base: u32,
+        inner_gm: &GuestMemory,
+        inner_msi: Arc<dyn SignalMsi>,
+    ) -> (GuestMemory, Arc<SmmuSignalMsi>) {
+        // `bus_range` is owned and only the DMA wrapper needs it, so it is
+        // moved into the wrapper rather than cloned.
+        let gm = GuestMemory::new(
+            "smmu-translating",
+            SmmuTranslatingMemory {
+                shared: Arc::clone(self),
+                bus_range,
+                stream_id_base,
+                inner_gm: inner_gm.clone(),
+            },
+        );
+
+        let signal_msi = Arc::new(SmmuSignalMsi {
+            shared: Arc::clone(self),
+            stream_id_base,
+            inner: inner_msi,
+        });
+
+        (gm, signal_msi)
+    }
+
+    /// Wraps `inner` in an SMMU-aware irqfd for a PCI device behind this SMMU.
+    ///
+    /// `stream_id_base` is the offset into this SMMU's stream table for the
+    /// root complex this device belongs to.
+    ///
+    /// Irqfd routes created from the returned wrapper translate MSI
+    /// addresses through the SMMU page tables before programming the
+    /// kernel route.
+    pub fn create_irqfd(
+        self: &Arc<Self>,
+        stream_id_base: u32,
+        inner: Arc<dyn IrqFd>,
+    ) -> Arc<SmmuIrqFd> {
+        let shared = Arc::clone(self);
+        let irqfd = SmmuIrqFd {
+            shared,
+            stream_id_base,
+            inner,
+        };
+        Arc::new(irqfd)
+    }
+}
+
+/// A [`guestmem::GuestMemoryAccess`] implementation that translates IOVAs via the SMMU.
+///
+/// Each PCI device behind the SMMU gets its own `SmmuTranslatingMemory`.
+/// When the device reads or writes guest memory using an IOVA, this
+/// wrapper translates the IOVA to a GPA using the SMMU page tables, then
+/// delegates to the underlying guest memory.
+pub struct SmmuTranslatingMemory {
+    /// Shared SMMU state, used to translate addresses and to report events.
+    shared: Arc<SmmuSharedState>,
+    /// Bus range used, together with `stream_id_base`, to compose the stream ID.
+    bus_range: AssignedBusRange,
+    /// Offset into the SMMU's stream table for this root complex.
+    stream_id_base: u32,
+    /// Underlying guest memory that is accessed with the translated address.
+    inner_gm: GuestMemory,
+}
+
+impl SmmuTranslatingMemory {
+    /// Performs a translated memory operation, handling page-crossing accesses.
+    ///
+    /// The access starting at `iova` and spanning `len` bytes is split at
+    /// 4 KiB boundaries, because each page may have a different translation.
+    /// `op` is invoked once per chunk as `op(address, offset, chunk_len)`:
+    /// `address` is the address to use for the chunk, `offset` is the chunk's
+    /// byte offset from the start of the access, and `chunk_len` is its length.
+    ///
+    /// If no stream ID can be composed (bus not assigned), every chunk is
+    /// passed through untranslated and no lock is taken. Otherwise the SMMU
+    /// read lock is held across both translation and `op` for each chunk,
+    /// preventing config changes between translation and the actual DMA.
+    ///
+    /// On an abort or a translation fault, the event is recorded with
+    /// `write_event` and an error carrying the failing IOVA is returned.
+    /// Chunks before the failing one have already been processed.
+    fn do_translated_op(
+        &self,
+        iova: u64,
+        len: usize,
+        write: bool,
+        mut op: impl FnMut(u64, usize, usize) -> Result<(), GuestMemoryBackingError>,
+    ) -> Result<(), GuestMemoryBackingError> {
+        // `None` means the bus is not assigned: bypass, no lock needed.
+        let sid = compose_stream_id(&self.bus_range, self.stream_id_base, None);
+
+        let mut offset = 0usize;
+        while offset < len {
+            let current_iova = iova.wrapping_add(offset as u64);
+
+            // Compute how many bytes until the next page boundary.
+            let page_offset = (current_iova & 0xFFF) as usize;
+            let bytes_in_page = (0x1000 - page_offset).min(len - offset);
+
+            let Some(sid) = sid else {
+                op(current_iova, offset, bytes_in_page)?;
+                offset += bytes_in_page;
+                continue;
+            };
+
+            // Hold the read lock across translate + memory access to prevent
+            // SMMU config from changing between getting the GPA and using it.
+            let inner = self.shared.inner.read();
+            let gpa = match self
+                .shared
+                .translate_locked(&inner, sid, current_iova, write)
+            {
+                TranslateResult::Bypass => current_iova,
+                TranslateResult::Translated(gpa) => gpa,
+                TranslateResult::Abort(event) => {
+                    // Release the read lock before recording the event.
+                    drop(inner);
+                    self.shared.write_event(event);
+                    return Err(GuestMemoryBackingError::other(
+                        current_iova,
+                        SmmuTranslationError {
+                            iova: current_iova,
+                            msg: "DMA aborted by STE config",
+                        },
+                    ));
+                }
+                TranslateResult::Fault(event) => {
+                    // Release the read lock before recording the event.
+                    drop(inner);
+                    self.shared.write_event(event);
+                    return Err(GuestMemoryBackingError::other(
+                        current_iova,
+                        SmmuTranslationError {
+                            iova: current_iova,
+                            msg: "translation fault",
+                        },
+                    ));
+                }
+            };
+
+            op(gpa, offset, bytes_in_page)?;
+            drop(inner);
+
+            offset += bytes_in_page;
+        }
+
+        Ok(())
+    }
+}
+
+// UNSAFETY: SmmuTranslatingMemory returns `None` from `mapping()`, so the
+// caller never gets a raw pointer. All accesses go through the fallback
+// methods which translate IOVAs to GPAs and delegate to the inner
+// GuestMemory. The inner GuestMemory is itself safe.
+#[expect(unsafe_code)]
+unsafe impl guestmem::GuestMemoryAccess for SmmuTranslatingMemory {
+    /// Never exposes a direct mapping, so every access is translated.
+    fn mapping(&self) -> Option<NonNull<u8>> {
+        // Force all accesses through the fallback path for translation.
+        None
+    }
+
+    /// Reports the full 64-bit range as addressable.
+    fn max_address(&self) -> u64 {
+        // IOVAs can use the full address range; translation determines
+        // the actual valid range.
+        u64::MAX
+    }
+
+    /// Reads `len` bytes starting at IOVA `addr` into `dest`, translating
+    /// each page-sized chunk before reading from the inner guest memory.
+    ///
+    /// An inner-memory failure is reported against the IOVA of the chunk
+    /// that failed, matching how translation faults are reported.
+    unsafe fn read_fallback(
+        &self,
+        addr: u64,
+        dest: *mut u8,
+        len: usize,
+    ) -> Result<(), GuestMemoryBackingError> {
+        self.do_translated_op(addr, len, false, |gpa, offset, chunk_len| {
+            // SAFETY: dest is valid for len bytes per the trait contract.
+            // We slice into dest at the correct offset.
+            // NOTE(review): this forms a `&mut [u8]` over `dest`; confirm the
+            // trait contract guarantees the buffer is initialized.
+            let chunk_dest = unsafe { std::slice::from_raw_parts_mut(dest.add(offset), chunk_len) };
+            self.inner_gm
+                .read_at(gpa, chunk_dest)
+                .map_err(|e| GuestMemoryBackingError::other(addr.wrapping_add(offset as u64), e))
+        })
+    }
+
+    /// Writes `len` bytes from `src` starting at IOVA `addr`, translating
+    /// each page-sized chunk before writing to the inner guest memory.
+    ///
+    /// An inner-memory failure is reported against the IOVA of the chunk
+    /// that failed, matching how translation faults are reported.
+    unsafe fn write_fallback(
+        &self,
+        addr: u64,
+        src: *const u8,
+        len: usize,
+    ) -> Result<(), GuestMemoryBackingError> {
+        self.do_translated_op(addr, len, true, |gpa, offset, chunk_len| {
+            // SAFETY: src is valid for len bytes per the trait contract.
+            let chunk_src = unsafe { std::slice::from_raw_parts(src.add(offset), chunk_len) };
+            self.inner_gm
+                .write_at(gpa, chunk_src)
+                .map_err(|e| GuestMemoryBackingError::other(addr.wrapping_add(offset as u64), e))
+        })
+    }
+
+    /// Fills `len` bytes starting at IOVA `addr` with `val`, translating
+    /// each page-sized chunk as a write access.
+    fn fill_fallback(&self, addr: u64, val: u8, len: usize) -> Result<(), GuestMemoryBackingError> {
+        self.do_translated_op(addr, len, true, |gpa, offset, chunk_len| {
+            self.inner_gm
+                .fill_at(gpa, val, chunk_len)
+                .map_err(|e| GuestMemoryBackingError::other(addr.wrapping_add(offset as u64), e))
+        })
+    }
+}
+
+/// A [`SignalMsi`] wrapper that translates MSI addresses through the SMMU.
+///
+/// When a device behind the SMMU fires an MSI, the MSI address may be an
+/// IOVA (Linux maps MSI doorbell pages into the device's IOVA space via
+/// `iommu_dma_prepare_msi()`). This wrapper translates the address before
+/// forwarding to the inner MSI target (typically an ITS or GICv2m wrapper).
+pub struct SmmuSignalMsi {
+    /// Shared SMMU state, used to translate addresses and to report events.
+    shared: Arc<SmmuSharedState>,
+    /// Offset into the SMMU's stream table for this root complex.
+    stream_id_base: u32,
+    /// MSI target that receives the MSI once its address has been translated.
+    inner: Arc<dyn SignalMsi>,
+}
+
+impl SignalMsi for SmmuSignalMsi {
+    /// Translates `address` as a write by the signaling device and forwards
+    /// the MSI to the inner target.
+    ///
+    /// The stream ID is `stream_id_base` plus the low 16 bits of `devid`.
+    /// In bypass the address is forwarded unchanged; on a successful
+    /// translation the translated address is forwarded. `devid` and `data`
+    /// are always passed through as-is. On an abort or fault the event is
+    /// recorded and the MSI is dropped. An MSI without a `devid` cannot be
+    /// attributed to a stream and is dropped with a warning.
+    fn signal_msi(&self, devid: Option<u32>, address: u64, data: u32) {
+        // MsiTarget resolves devid to a BDF before calling us.
+        let Some(bdf) = devid else {
+            // No BDF means no stream ID to translate with; make the drop
+            // visible instead of silently losing the interrupt.
+            tracelimit::warn_ratelimited!(address, "smmu: dropping MSI with no device ID");
+            return;
+        };
+        let sid = self.stream_id_base + (bdf & 0xFFFF);
+
+        match self.shared.translate(sid, address, true) {
+            TranslateResult::Bypass => {
+                self.inner.signal_msi(devid, address, data);
+            }
+            TranslateResult::Translated(gpa) => {
+                self.inner.signal_msi(devid, gpa, data);
+            }
+            TranslateResult::Abort(event) => {
+                self.shared.write_event(event);
+                tracelimit::warn_ratelimited!(sid, address, "smmu: MSI aborted by STE config");
+            }
+            TranslateResult::Fault(event) => {
+                self.shared.write_event(event);
+                tracelimit::warn_ratelimited!(sid, address, "smmu: MSI translation fault");
+            }
+        }
+    }
+}
+
+/// An [`IrqFd`] wrapper that produces SMMU-translating irqfd routes.
+///
+/// When a device behind the SMMU programs its MSI-X table, the MSI address
+/// may be an IOVA. This wrapper creates [`SmmuIrqFdRoute`] instances that
+/// translate the address through the SMMU before forwarding to the inner
+/// irqfd route (which may itself be an ITS wrapper).
+pub struct SmmuIrqFd {
+    /// Shared SMMU state, handed to every route created by this wrapper.
+    shared: Arc<SmmuSharedState>,
+    /// Offset into the SMMU's stream table for this root complex.
+    stream_id_base: u32,
+    /// Inner irqfd that supplies the routes being wrapped.
+    inner: Arc<dyn IrqFd>,
+}
+
+impl IrqFd for SmmuIrqFd {
+    /// Creates a route on the inner irqfd and wraps it so that its MSI
+    /// address is translated through the SMMU when the route is enabled.
+    ///
+    /// Fails if the inner irqfd cannot create a route.
+    fn new_irqfd_route(&self) -> anyhow::Result<Box<dyn IrqFdRoute>> {
+        let inner = self.inner.new_irqfd_route()?;
+        let route = SmmuIrqFdRoute {
+            shared: self.shared.clone(),
+            stream_id_base: self.stream_id_base,
+            inner,
+        };
+        Ok(Box::new(route))
+    }
+}
+
+/// An [`IrqFdRoute`] wrapper that translates the MSI address through the
+/// SMMU on [`enable`](IrqFdRoute::enable).
+///
+/// Translation happens at route-programming time (when the guest writes
+/// the MSI-X table), not per-interrupt. If the guest changes SMMU page
+/// tables after programming MSI-X, it must also re-program the MSI-X
+/// entry (which is the normal flow — the IOMMU driver does this).
+struct SmmuIrqFdRoute {
+    /// Shared SMMU state, used to translate addresses and to report events.
+    shared: Arc<SmmuSharedState>,
+    /// Offset into the SMMU's stream table for this root complex.
+    stream_id_base: u32,
+    /// Inner route that is programmed with the translated address.
+    inner: Box<dyn IrqFdRoute>,
+}
+
+impl IrqFdRoute for SmmuIrqFdRoute {
+    /// Returns the inner route's event.
+    fn event(&self) -> &Event {
+        self.inner.event()
+    }
+
+    /// Translates `address` as a write by the device and enables the inner
+    /// route with the result.
+    ///
+    /// The stream ID is `stream_id_base` plus the low 16 bits of `devid`.
+    /// In bypass the address is used unchanged. On an abort or fault the
+    /// event is recorded and the inner route is not enabled. A request
+    /// without a `devid` cannot be attributed to a stream and is ignored
+    /// with a warning.
+    ///
+    /// NOTE(review): when translation fails, the inner route is left in
+    /// whatever state it was in before this call; confirm whether it should
+    /// be disabled instead.
+    fn enable(&self, address: u64, data: u32, devid: Option<u32>) {
+        // MsiRoute resolves devid to a BDF before calling us.
+        let Some(bdf) = devid else {
+            // No BDF means no stream ID to translate with; make the skipped
+            // programming visible instead of silently ignoring it.
+            tracelimit::warn_ratelimited!(
+                address,
+                "smmu: ignoring irqfd MSI route with no device ID"
+            );
+            return;
+        };
+        let sid = self.stream_id_base + (bdf & 0xFFFF);
+
+        match self.shared.translate(sid, address, true) {
+            TranslateResult::Bypass => {
+                self.inner.enable(address, data, devid);
+            }
+            TranslateResult::Translated(gpa) => {
+                self.inner.enable(gpa, data, devid);
+            }
+            TranslateResult::Abort(event) => {
+                self.shared.write_event(event);
+                tracelimit::warn_ratelimited!(
+                    sid,
+                    address,
+                    "smmu: irqfd MSI route aborted by STE config"
+                );
+            }
+            TranslateResult::Fault(event) => {
+                self.shared.write_event(event);
+                tracelimit::warn_ratelimited!(
+                    sid,
+                    address,
+                    "smmu: irqfd MSI route translation fault"
+                );
+            }
+        }
+    }
+
+    /// Disables the inner route.
+    fn disable(&self) {
+        self.inner.disable();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::spec::cd::CD_SIZE;
+    use crate::spec::cd::CdDw0;
+    use crate::spec::cd::CdDw1;
+    use crate::spec::cd::Ips;
+    use crate::spec::cd::Tg0;
+    use crate::spec::events::EventId;
+    use crate::spec::pt::ApBits;
+    use crate::spec::pt::PtDesc;
+    use crate::spec::ste::STE_SIZE;
+    use crate::spec::ste::Ste;
+    use crate::spec::ste::SteConfig;
+    use crate::spec::ste::SteDw0;
+    use crate::spec::ste::SteDw1;
+    use parking_lot::Mutex;
+    use std::sync::Arc;
+
+    // Memory layout for tests.
+    /// Base GPA of the stream table; STEs are written at `sid * STE_SIZE`.
+    const STRTAB_BASE: u64 = 0x10_0000;
+    /// Stream table log2 size passed to `set_strtab`.
+    const STRTAB_LOG2SIZE: u8 = 10;
+    /// Base GPA of the context descriptors; CDs are written at `ssid * CD_SIZE`.
+    const CD_BASE: u64 = 0x20_0000;
+    /// GPA of the level 1 page table (referenced by the CD's TTB0).
+    const PT_L1_BASE: u64 = 0x30_1000;
+    /// GPA of the level 2 page table.
+    const PT_L2_BASE: u64 = 0x30_2000;
+    /// GPA of the level 3 page table.
+    const PT_L3_BASE: u64 = 0x30_3000;
+    /// GPA of the data page that the test page tables map IOVA 0 to.
+    const DATA_GPA: u64 = 0x4000_0000;
+    /// EVTQ base GPA for tests (must not overlap other test regions).
+    const EVTQ_BASE: u64 = 0x4100_0000;
+    /// EVTQ log2 size for tests (3 = 8 entries).
+    const EVTQ_LOG2SIZE: u8 = 3;
+    /// Segment number from which `TEST_STREAM_ID_BASE` is derived.
+    const TEST_SEGMENT: u16 = 0;
+    /// Stream ID base for the test root complex (matches IORT output_base).
+    const TEST_STREAM_ID_BASE: u32 = (TEST_SEGMENT as u32) << 16;
+    /// Bus number assigned to the test device's bus range.
+    const TEST_BUS: u8 = 1;
+    /// The RID for the test device: (bus << 8) | devfn.
+    const TEST_RID: u32 = (TEST_BUS as u32) << 8;
+
+    /// Test double for [`SignalMsi`] that records every call it receives.
+    struct MockSignalMsi {
+        /// Recorded `(devid, address, data)` triples, in call order.
+        calls: Mutex<Vec<(Option<u32>, u64, u32)>>,
+    }
+
+    impl MockSignalMsi {
+        /// Creates a mock with an empty call log.
+        fn new() -> Arc<Self> {
+            let calls = Mutex::new(Vec::new());
+            Arc::new(Self { calls })
+        }
+
+        /// Returns the calls recorded so far and clears the log.
+        fn take_calls(&self) -> Vec<(Option<u32>, u64, u32)> {
+            let mut recorded = self.calls.lock();
+            std::mem::take(&mut *recorded)
+        }
+    }
+
+    impl SignalMsi for MockSignalMsi {
+        fn signal_msi(&self, devid: Option<u32>, address: u64, data: u32) {
+            let call = (devid, address, data);
+            self.calls.lock().push(call);
+        }
+    }
+
+    /// Returns a bus range assigned to exactly `TEST_BUS`.
+    fn make_bus_range() -> AssignedBusRange {
+        let range = AssignedBusRange::new();
+        range.set_bus_range(TEST_BUS, TEST_BUS);
+        range
+    }
+
+    /// Stream ID of the test device: the stream ID base plus its RID.
+    fn expected_sid() -> u32 {
+        TEST_STREAM_ID_BASE + TEST_RID
+    }
+
+    /// Writes `ste` into the stream table slot for `sid`.
+    fn write_ste(gm: &GuestMemory, sid: u32, ste: &Ste) {
+        let slot_gpa = STRTAB_BASE + u64::from(sid) * (STE_SIZE as u64);
+        gm.write_plain(slot_gpa, ste).expect("write STE");
+    }
+
+    /// Builds a valid stage 1 translation STE whose context pointer refers
+    /// to `cd_base` and whose `S1CDMax` is 0.
+    fn make_s1_ste(cd_base: u64) -> Ste {
+        Ste {
+            qw0: SteDw0::new()
+                .with_v(true)
+                .with_config(SteConfig::S1_TRANS.0)
+                // The context pointer field holds the address shifted right by 6.
+                .with_s1_context_ptr(cd_base >> 6)
+                .with_s1_cd_max(0),
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        }
+    }
+
+    /// Builds a valid STE configured for bypass.
+    fn make_bypass_ste() -> Ste {
+        let qw0 = SteDw0::new().with_v(true).with_config(SteConfig::BYPASS.0);
+        Ste {
+            qw0,
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        }
+    }
+
+    /// Builds a valid STE configured to abort.
+    fn make_abort_ste() -> Ste {
+        let qw0 = SteDw0::new().with_v(true).with_config(SteConfig::ABORT.0);
+        Ste {
+            qw0,
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        }
+    }
+
+    /// Writes a valid context descriptor for `ssid` into the table at
+    /// `cd_base`, with TTB0 pointing at `PT_L1_BASE`.
+    fn write_cd(gm: &GuestMemory, cd_base: u64, ssid: u32) {
+        use crate::spec::cd::Cd;
+
+        let qw0 = CdDw0::new()
+            .with_v(true)
+            .with_t0sz(32)
+            .with_tg0(Tg0::GRAN_4K.0)
+            .with_ips(Ips::IPS_40.0)
+            .with_aa64(true)
+            .with_asid(1);
+        // TTB0 is stored as the address shifted right by 4.
+        let qw1 = CdDw1::new().with_ttb0(PT_L1_BASE >> 4);
+        let descriptor = Cd {
+            qw0,
+            qw1,
+            _qw2: 0,
+            mair0: 0xFF440C0400,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        };
+
+        let slot_gpa = cd_base + u64::from(ssid) * (CD_SIZE as u64);
+        gm.write_plain(slot_gpa, &descriptor).expect("write CD");
+    }
+
+    /// Encodes a valid table descriptor pointing at `next_table`.
+    fn table_desc(next_table: u64) -> u64 {
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_addr_bits(next_table >> 12);
+        desc.into()
+    }
+
+    /// Encodes a valid page descriptor for `output_addr` with the access
+    /// flag set and `RW_EL1` access permissions.
+    fn page_desc(output_addr: u64) -> u64 {
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_af(true)
+            .with_ap(ApBits::RW_EL1.0)
+            .with_addr_bits(output_addr >> 12);
+        desc.into()
+    }
+
+    /// Stores the raw page table descriptor `desc` at GPA `addr`.
+    fn write_pt_desc(gm: &GuestMemory, addr: u64, desc: u64) {
+        let written = gm.write_plain(addr, &desc);
+        written.expect("write PT desc");
+    }
+
+    /// Installs a full translation context for `sid`:
+    /// STE (S1_TRANS) → CD → page table mapping IOVA 0..4K → DATA_GPA.
+    fn setup_translation(gm: &GuestMemory, sid: u32) {
+        write_ste(gm, sid, &make_s1_ste(CD_BASE));
+        write_cd(gm, CD_BASE, 0);
+
+        // 3-level page table (T0SZ=32, 4K granule): L1[0] → L2, L2[0] → L3,
+        // L3[0] → page at DATA_GPA.
+        let entries = [
+            (PT_L1_BASE, table_desc(PT_L2_BASE)),
+            (PT_L2_BASE, table_desc(PT_L3_BASE)),
+            (PT_L3_BASE, page_desc(DATA_GPA)),
+        ];
+        for (slot_gpa, desc) in entries {
+            write_pt_desc(gm, slot_gpa, desc);
+        }
+    }
+
+    /// Creates an enabled SMMU over `gm` with the test stream table and an
+    /// enabled event queue.
+    fn make_shared_state(gm: &GuestMemory) -> Arc<SmmuSharedState> {
+        let smmu = SmmuSharedState::new(gm.clone(), None, None);
+
+        smmu.set_strtab(STRTAB_BASE, STRTAB_LOG2SIZE);
+        smmu.set_enabled(true);
+
+        // With the EVTQ enabled, fault events are written to guest memory.
+        smmu.set_evtq_config(EVTQ_BASE, EVTQ_LOG2SIZE);
+        smmu.set_evtq_enabled(true);
+
+        smmu
+    }
+
+    /// Returns the number of events written so far, taken from EVTQ_PROD in
+    /// the shared state.
+    fn evtq_event_count(state: &SmmuSharedState) -> u32 {
+        let prod = state.evtq_prod();
+        prod
+    }
+
+    // =========================================================================
+    // SmmuTranslatingMemory tests
+    // =========================================================================
+
+    #[test]
+    fn test_translating_memory_basic_read() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+        setup_translation(&backing, expected_sid());
+
+        // Seed the physical page that IOVA 0 is mapped to.
+        let expected = b"hello SMMU";
+        backing.write_at(DATA_GPA, expected).unwrap();
+
+        let smmu = make_shared_state(&backing);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // A read at IOVA 0 must return the bytes stored at DATA_GPA.
+        let mut readback = vec![0u8; expected.len()];
+        iova_gm.read_at(0, &mut readback).unwrap();
+        assert_eq!(&readback, expected);
+    }
+
+    #[test]
+    fn test_translating_memory_basic_write() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+        setup_translation(&backing, expected_sid());
+
+        let smmu = make_shared_state(&backing);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // Write through the translating view at IOVA 0.
+        let payload = b"write test";
+        iova_gm.write_at(0, payload).unwrap();
+
+        // The bytes must land at the physical GPA the IOVA maps to.
+        let mut readback = vec![0u8; payload.len()];
+        backing.read_at(DATA_GPA, &mut readback).unwrap();
+        assert_eq!(&readback, payload);
+    }
+
+    #[test]
+    fn test_translating_memory_with_offset() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+        setup_translation(&backing, expected_sid());
+
+        // Seed the mapped page at a non-zero offset.
+        let expected = b"offset data";
+        backing.write_at(DATA_GPA + 0x100, expected).unwrap();
+
+        let smmu = make_shared_state(&backing);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // The page offset is preserved: IOVA 0x100 → DATA_GPA + 0x100.
+        let mut readback = vec![0u8; expected.len()];
+        iova_gm.read_at(0x100, &mut readback).unwrap();
+        assert_eq!(&readback, expected);
+    }
+
+    #[test]
+    fn test_translating_memory_cross_page() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // STE and CD for the test device.
+        write_ste(&backing, expected_sid(), &make_s1_ste(CD_BASE));
+        write_cd(&backing, CD_BASE, 0);
+
+        // Two adjacent IOVA pages backed by non-adjacent physical pages:
+        // L3[0] → DATA_GPA (page at IOVA 0x0000)
+        // L3[1] → DATA_GPA + 0x2000 (page at IOVA 0x1000)
+        write_pt_desc(&backing, PT_L1_BASE, table_desc(PT_L2_BASE));
+        write_pt_desc(&backing, PT_L2_BASE, table_desc(PT_L3_BASE));
+        write_pt_desc(&backing, PT_L3_BASE, page_desc(DATA_GPA));
+        write_pt_desc(&backing, PT_L3_BASE + 8, page_desc(DATA_GPA + 0x2000));
+
+        // Seed the tail of the first page and the head of the second page.
+        let first_page_tail = vec![0xAAu8; 0x10];
+        let second_page_head = vec![0xBBu8; 0x10];
+        backing.write_at(DATA_GPA + 0xFF0, &first_page_tail).unwrap();
+        backing.write_at(DATA_GPA + 0x2000, &second_page_head).unwrap();
+
+        let smmu = make_shared_state(&backing);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // A 32-byte read at IOVA 0xFF0 straddles the page boundary.
+        let mut readback = vec![0u8; 0x20];
+        iova_gm.read_at(0xFF0, &mut readback).unwrap();
+        let (low, high) = readback.split_at(0x10);
+        assert_eq!(low, &first_page_tail);
+        assert_eq!(high, &second_page_head);
+    }
+
+    #[test]
+    fn test_translating_memory_bypass() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // The device's STE selects bypass mode.
+        write_ste(&backing, expected_sid(), &make_bypass_ste());
+
+        // Seed GPA 0x1000 directly.
+        let expected = b"bypass data";
+        backing.write_at(0x1000, expected).unwrap();
+
+        let smmu = make_shared_state(&backing);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // In bypass mode the IOVA is used as the GPA (identity mapping).
+        let mut readback = vec![0u8; expected.len()];
+        iova_gm.read_at(0x1000, &mut readback).unwrap();
+        assert_eq!(&readback, expected);
+    }
+
+    #[test]
+    fn test_translating_memory_abort() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // The device's STE selects abort mode.
+        write_ste(&backing, expected_sid(), &make_abort_ste());
+
+        let smmu = make_shared_state(&backing);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // The access must be rejected.
+        let mut scratch = vec![0u8; 4];
+        assert!(iova_gm.read_at(0, &mut scratch).is_err());
+
+        // Exactly one event must have been queued in the EVTQ.
+        assert_eq!(evtq_event_count(&smmu), 1);
+    }
+
+    #[test]
+    fn test_translating_memory_unmapped() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // STE and CD are present, but the L1 table is left all zeros, so
+        // any access takes a translation fault.
+        write_ste(&backing, expected_sid(), &make_s1_ste(CD_BASE));
+        write_cd(&backing, CD_BASE, 0);
+
+        let smmu = make_shared_state(&backing);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        let mut scratch = vec![0u8; 4];
+        assert!(iova_gm.read_at(0, &mut scratch).is_err());
+
+        // Exactly one fault event must have been queued in the EVTQ.
+        assert_eq!(evtq_event_count(&smmu), 1);
+
+        // The first EVTQ slot in guest memory holds the fault record.
+        let record: EvtEntry = backing.read_plain(EVTQ_BASE).expect("read event");
+        assert_eq!(record.event_id(), EventId::F_TRANSLATION);
+    }
+
+    #[test]
+    fn test_translating_memory_unassigned_bus() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // Seed GPA 0x2000 directly.
+        let expected = b"unassigned bus data";
+        backing.write_at(0x2000, expected).unwrap();
+
+        let smmu = make_shared_state(&backing);
+        // The bus range is deliberately left unassigned (secondary_bus = 0).
+        let unassigned = AssignedBusRange::new();
+        let (iova_gm, _msi) = smmu.create_device_context(
+            unassigned,
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // Translation is bypassed, so the IOVA is used as the GPA.
+        let mut readback = vec![0u8; expected.len()];
+        iova_gm.read_at(0x2000, &mut readback).unwrap();
+        assert_eq!(&readback, expected);
+    }
+
+    #[test]
+    fn test_translating_memory_smmu_disabled() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // Seed GPA 0x3000 directly.
+        let expected = b"disabled smmu";
+        backing.write_at(0x3000, expected).unwrap();
+
+        // A freshly created SMMU that is never enabled or configured.
+        let smmu = SmmuSharedState::new(backing.clone(), None, None);
+        let (iova_gm, _msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            MockSignalMsi::new(),
+        );
+
+        // Translation is bypassed, so the IOVA is used as the GPA.
+        let mut readback = vec![0u8; expected.len()];
+        iova_gm.read_at(0x3000, &mut readback).unwrap();
+        assert_eq!(&readback, expected);
+    }
+
+    // =========================================================================
+    // SmmuSignalMsi tests
+    // =========================================================================
+
+    #[test]
+    fn test_signal_msi_translated() {
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let sid = expected_sid();
+        setup_translation(&gm, sid);
+
+        // Also map a doorbell page via L3[1]: IOVA 0x1000 → DATA_GPA + 0x1000.
+        write_pt_desc(&gm, PT_L3_BASE + 8, page_desc(DATA_GPA + 0x1000));
+
+        let state = make_shared_state(&gm);
+        let bus_range = make_bus_range();
+        let mock_msi = MockSignalMsi::new();
+
+        let (_gm, smmu_msi) =
+            state.create_device_context(bus_range, TEST_STREAM_ID_BASE, &gm, mock_msi.clone());
+
+        // Fire MSI with IOVA address 0x1040 (page 1 + offset 0x40).
+        // devid is a RID — the SMMU adds it to the stream ID base to get the SID.
+        smmu_msi.signal_msi(Some(TEST_RID), 0x1040, 0xDEAD);
+
+        let calls = mock_msi.take_calls();
+        assert_eq!(calls.len(), 1);
+        // Translated address: DATA_GPA + 0x1000 + 0x40.
+        assert_eq!(calls[0], (Some(TEST_RID), DATA_GPA + 0x1040, 0xDEAD));
+    }
+
+    #[test]
+    fn test_signal_msi_bypass() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+        write_ste(&backing, expected_sid(), &make_bypass_ste());
+
+        let smmu = make_shared_state(&backing);
+        let recorder = MockSignalMsi::new();
+        let (_gm, smmu_msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            recorder.clone(),
+        );
+
+        // MsiTarget resolves devid to a BDF before calling SmmuSignalMsi.
+        smmu_msi.signal_msi(Some(TEST_RID), 0xFEE0_0000, 0x42);
+
+        // In bypass mode the MSI is forwarded exactly once, unchanged.
+        let forwarded = recorder.take_calls();
+        assert_eq!(forwarded.len(), 1);
+        assert_eq!(forwarded[0], (Some(TEST_RID), 0xFEE0_0000, 0x42));
+    }
+
+    #[test]
+    fn test_signal_msi_unmapped() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // Stage 1 translation is selected, but no page table entries exist.
+        write_ste(&backing, expected_sid(), &make_s1_ste(CD_BASE));
+        write_cd(&backing, CD_BASE, 0);
+
+        let smmu = make_shared_state(&backing);
+        let recorder = MockSignalMsi::new();
+        let (_gm, smmu_msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            recorder.clone(),
+        );
+
+        // Signal an MSI whose address is not mapped. devid is a RID.
+        smmu_msi.signal_msi(Some(TEST_RID), 0x1000, 0x42);
+
+        // The MSI must not reach the inner target.
+        let forwarded = recorder.take_calls();
+        assert!(forwarded.is_empty());
+
+        // Exactly one fault event must have been queued in the EVTQ.
+        assert_eq!(evtq_event_count(&smmu), 1);
+    }
+
+    #[test]
+    fn test_signal_msi_devid_passthrough() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+        write_ste(&backing, expected_sid(), &make_bypass_ste());
+
+        let smmu = make_shared_state(&backing);
+        let recorder = MockSignalMsi::new();
+        let (_gm, smmu_msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            recorder.clone(),
+        );
+
+        smmu_msi.signal_msi(Some(TEST_RID), 0x1000, 0x42);
+
+        // The inner MSI target must see the devid (RID) unchanged.
+        let forwarded = recorder.take_calls();
+        assert_eq!(forwarded.len(), 1);
+        let (devid, _address, _data) = forwarded[0];
+        assert_eq!(devid, Some(TEST_RID));
+    }
+
+    #[test]
+    fn test_signal_msi_no_devid() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        let smmu = make_shared_state(&backing);
+        let recorder = MockSignalMsi::new();
+        let (_gm, smmu_msi) = smmu.create_device_context(
+            make_bus_range(),
+            TEST_STREAM_ID_BASE,
+            &backing,
+            recorder.clone(),
+        );
+
+        // With devid=None there is no BDF, so the MSI must be dropped.
+        smmu_msi.signal_msi(None, 0xFEE0_0000, 0x42);
+
+        assert!(recorder.take_calls().is_empty());
+    }
+
+    // =========================================================================
+    // Stream ID remapping tests (non-zero stream_id_base)
+    // =========================================================================
+
+    #[test]
+    fn test_translating_memory_nonzero_stream_id_base() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // A non-zero stream_id_base simulates a second root complex with
+        // its own region in the SMMU stream table.
+        // stream_id_base=256, bus=1 → SID = 256 + 256 = 512 (within 1024).
+        let stream_id_base: u32 = 256;
+        let bus: u8 = 1;
+        let remapped_sid = stream_id_base + (u32::from(bus) << 8);
+
+        // Install the translation context at the remapped stream ID.
+        write_ste(&backing, remapped_sid, &make_s1_ste(CD_BASE));
+        write_cd(&backing, CD_BASE, 0);
+        write_pt_desc(&backing, PT_L1_BASE, table_desc(PT_L2_BASE));
+        write_pt_desc(&backing, PT_L2_BASE, table_desc(PT_L3_BASE));
+        write_pt_desc(&backing, PT_L3_BASE, page_desc(DATA_GPA));
+
+        let expected = b"remapped sid test";
+        backing.write_at(DATA_GPA, expected).unwrap();
+
+        let smmu = make_shared_state(&backing);
+        let range = AssignedBusRange::new();
+        range.set_bus_range(bus, bus);
+
+        let (iova_gm, _msi) =
+            smmu.create_device_context(range, stream_id_base, &backing, MockSignalMsi::new());
+
+        // Reading IOVA 0 only succeeds if the STE at the remapped SID is used.
+        let mut readback = vec![0u8; expected.len()];
+        iova_gm.read_at(0, &mut readback).unwrap();
+        assert_eq!(&readback, expected);
+    }
+
+    #[test]
+    fn test_signal_msi_nonzero_stream_id_base() {
+        let backing = GuestMemory::allocate(0x5000_0000);
+
+        // Non-zero base (different root complex).
+        let stream_id_base: u32 = 256;
+        let bus: u8 = 1;
+        let rid = u32::from(bus) << 8;
+
+        // Install a bypass STE at the remapped stream ID.
+        write_ste(&backing, stream_id_base + rid, &make_bypass_ste());
+
+        let smmu = make_shared_state(&backing);
+        let range = AssignedBusRange::new();
+        range.set_bus_range(bus, bus);
+        let recorder = MockSignalMsi::new();
+
+        let (_gm, smmu_msi) =
+            smmu.create_device_context(range, stream_id_base, &backing, recorder.clone());
+
+        // In bypass mode the address passes through unchanged.
+        smmu_msi.signal_msi(Some(rid), 0xFEE0_0000, 0x99);
+
+        let forwarded = recorder.take_calls();
+        assert_eq!(forwarded.len(), 1);
+        assert_eq!(forwarded[0], (Some(rid), 0xFEE0_0000, 0x99));
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/spec/cd.rs b/vm/devices/iommu/smmu/src/spec/cd.rs
new file mode 100644
index 0000000000..9d4d827df3
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/spec/cd.rs
@@ -0,0 +1,445 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 Context Descriptor (CD) definitions.
+//!
+//! Each CD is 64 bytes (512 bits). The CD contains stage 1 translation table
+//! pointers and ASID for a given stream/substream.
+
+use bitfield_struct::bitfield;
+use open_enum::open_enum;
+use zerocopy::FromBytes;
+use zerocopy::Immutable;
+use zerocopy::IntoBytes;
+use zerocopy::KnownLayout;
+
+/// Size of one context descriptor in bytes (eight 64-bit quadwords).
+pub const CD_SIZE: usize = 64;
+
+/// In-memory layout of a context descriptor (64 bytes, `repr(C)`).
+#[repr(C)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct Cd {
+    /// Quadword 0: T0SZ, TG0, cacheability, EPD0, V, IPS, ASID, etc.
+    pub qw0: CdDw0,
+    /// Quadword 1: TTB0 plus its control bits.
+    pub qw1: CdDw1,
+    /// Quadword 2: TTB1 (unused for TTB0-only translation).
+    pub _qw2: u64,
+    /// Quadword 3: MAIR0 (Memory Attribute Indirection Register 0).
+    pub mair0: u64,
+    /// Quadword 4: MAIR1 (Memory Attribute Indirection Register 1).
+    pub mair1: u64,
+    /// Quadwords 5-7: AMAIR, PARTID, permission indirection, etc.
+    pub _qw5_7: [u64; 3],
+}
+
+impl Cd {
+    /// Returns true if the CD is valid (V bit of QW0 set).
+    pub fn valid(&self) -> bool {
+        self.qw0.v()
+    }
+
+    /// Returns the TTB0 translation table base address.
+    ///
+    /// The `ttb0` field of QW1 holds address bits `[55:4]`, so the stored
+    /// value is shifted left by 4; the low 4 address bits are always zero.
+    pub fn ttb0(&self) -> u64 {
+        self.qw1.ttb0() << 4
+    }
+
+    /// Returns T0SZ; the TTB0 VA range is 2^(64 - T0SZ) bytes.
+    pub fn t0sz(&self) -> u8 {
+        self.qw0.t0sz()
+    }
+
+    /// Returns TG0 (granule size for TTB0); the raw encoding is wrapped as-is.
+    pub fn tg0(&self) -> Tg0 {
+        Tg0(self.qw0.tg0())
+    }
+
+    /// Returns IPS (intermediate physical address size); the raw encoding is wrapped as-is.
+    pub fn ips(&self) -> Ips {
+        Ips(self.qw0.ips())
+    }
+
+    /// Returns the 16-bit ASID stored in QW0.
+    pub fn asid(&self) -> u16 {
+        self.qw0.asid()
+    }
+
+    /// Returns true if AA64 mode (VMSAv8-64) is selected.
+    pub fn aa64(&self) -> bool {
+        self.qw0.aa64()
+    }
+
+    /// Returns true if TTB0 walks are disabled (EPD0=1).
+    pub fn epd0(&self) -> bool {
+        self.qw0.epd0()
+    }
+}
+
+/// CD QW0 (bits `[63:0]`); fields are declared LSB-first in architectural bit order.
+#[bitfield(u64)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct CdDw0 {
+    /// VA region size for TTB0 (bits `[5:0]`). VA range = 2^(64 - T0SZ).
+    #[bits(6)]
+    pub t0sz: u8,
+    /// TTB0 granule size (bits `[7:6]`).
+    #[bits(2)]
+    pub tg0: u8,
+    /// TTB0 inner cacheability (bits `[9:8]`).
+    #[bits(2)]
+    pub ir0: u8,
+    /// TTB0 outer cacheability (bits `[11:10]`).
+    #[bits(2)]
+    pub or0: u8,
+    /// TTB0 shareability (bits `[13:12]`).
+    #[bits(2)]
+    pub sh0: u8,
+    /// Disable TTB0 walk (1 = fault on miss) (bit 14).
+    pub epd0: bool,
+    /// Translation table endianness (0=LE, 1=BE) (bit 15).
+    pub endi: bool,
+    /// VA region size for TTB1 (bits `[21:16]`).
+    #[bits(6)]
+    pub t1sz: u8,
+    /// TTB1 granule size (bits `[23:22]`).
+    #[bits(2)]
+    pub tg1: u8,
+    /// TTB1 inner cacheability (bits `[25:24]`).
+    #[bits(2)]
+    pub ir1: u8,
+    /// TTB1 outer cacheability (bits `[27:26]`).
+    #[bits(2)]
+    pub or1: u8,
+    /// TTB1 shareability (bits `[29:28]`).
+    #[bits(2)]
+    pub sh1: u8,
+    /// Disable TTB1 walk (bit 30).
+    pub epd1: bool,
+    /// CD valid bit (bit 31).
+    pub v: bool,
+    /// Intermediate physical address size (bits `[34:32]`).
+    #[bits(3)]
+    pub ips: u8,
+    /// Access flag fault disable (bit 35).
+    pub affd: bool,
+    /// Write implies XN (bit 36).
+    pub wxn: bool,
+    /// Unprivileged write implies XN (bit 37).
+    pub uwxn: bool,
+    /// Top byte ignore for TTB0 addresses (bit 38).
+    pub tbi0: bool,
+    /// Top byte ignore for TTB1 addresses (bit 39).
+    pub tbi1: bool,
+    /// Privileged Access Never (bit 40).
+    pub pan: bool,
+    /// VMSAv8-64 mode (must be 1 for AArch64 page tables) (bit 41).
+    pub aa64: bool,
+    /// HW dirty bit management (bit 42).
+    pub hd: bool,
+    /// HW access flag update (bit 43).
+    pub ha: bool,
+    /// Stall (0=terminate, 1=stall on fault) (bit 44).
+    pub s: bool,
+    /// Record faults in the event queue (bit 45).
+    pub r: bool,
+    /// Abort (rather than RAZ/WI) terminated transactions (bit 46).
+    pub a: bool,
+    /// ASID set (for TLB invalidation) (bit 47).
+    pub aset: bool,
+    /// ASID (16-bit, bits `[63:48]`).
+    #[bits(16)]
+    pub asid: u16,
+}
+
+/// CD QW1 (bits `[127:64]` of the CD): TTB0 base address and its control fields.
+#[bitfield(u64)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct CdDw1 {
+    /// Control bits (HAFT, E0PD0, NSCFG0, DisCH0), QW1 bits `[3:0]`.
+    #[bits(4)]
+    pub control: u8,
+    /// TTB0 address bits `[55:4]` (QW1 bits `[55:4]`). Actual address = stored << 4.
+    #[bits(52)]
+    pub ttb0: u64,
+    /// HW use fields (HWU0xx), QW1 bits `[59:56]` as declared here.
+    #[bits(4)]
+    pub hwu: u8,
+    /// SKL0 (start level override, if supported), QW1 bits `[61:60]` as declared here.
+    #[bits(2)]
+    pub skl0: u8,
+    #[bits(2)]
+    _reserved: u64,
+}
+
+open_enum! {
+    /// TTB0 granule size (the 2-bit `TG0` field of CD QW0, bits `[7:6]`).
+    pub enum Tg0: u8 {
+        /// 4KB granule.
+        GRAN_4K = 0b00,
+        /// 64KB granule.
+        GRAN_64K = 0b01,
+        /// 16KB granule.
+        GRAN_16K = 0b10,
+    }
+}
+
+open_enum! {
+    /// Intermediate Physical Address Size (the 3-bit `IPS` field of CD QW0, bits `[34:32]`).
+    pub enum Ips: u8 {
+        /// 32-bit (4GB).
+        IPS_32 = 0b000,
+        /// 36-bit (64GB).
+        IPS_36 = 0b001,
+        /// 40-bit (1TB).
+        IPS_40 = 0b010,
+        /// 42-bit (4TB).
+        IPS_42 = 0b011,
+        /// 44-bit (16TB).
+        IPS_44 = 0b100,
+        /// 48-bit (256TB).
+        IPS_48 = 0b101,
+        /// 52-bit (4PB).
+        IPS_52 = 0b110,
+    }
+}
+
+impl Ips {
+    /// Maps this IPS encoding to its physical address width in bits.
+    /// Encodings other than the named constants yield `None`.
+    pub fn bits(self) -> Option<u8> {
+        match self {
+            Self::IPS_32 => Some(32),
+            Self::IPS_36 => Some(36),
+            Self::IPS_40 => Some(40),
+            Self::IPS_42 => Some(42),
+            Self::IPS_44 => Some(44),
+            Self::IPS_48 => Some(48),
+            Self::IPS_52 => Some(52),
+            _ => None,
+        }
+    }
+}
+
+impl Tg0 {
+    /// Translation granule size in bytes for this encoding; `None` when the
+    /// encoding is not one of the named constants.
+    pub fn granule_size(self) -> Option<u64> {
+        match self {
+            Self::GRAN_4K => Some(4 << 10),
+            Self::GRAN_16K => Some(16 << 10),
+            Self::GRAN_64K => Some(64 << 10),
+            _ => None,
+        }
+    }
+
+    /// Number of address bits indexed by one page table level; `None`
+    /// when the encoding is not one of the named constants.
+    pub fn bits_per_level(self) -> Option<u8> {
+        match self {
+            Self::GRAN_4K => Some(9),
+            Self::GRAN_16K => Some(11),
+            Self::GRAN_64K => Some(13),
+            _ => None,
+        }
+    }
+
+    /// Page offset width in bits (log2 of the granule size); `None`
+    /// when the encoding is not one of the named constants.
+    pub fn page_shift(self) -> Option<u8> {
+        match self {
+            Self::GRAN_4K => Some(12),
+            Self::GRAN_16K => Some(14),
+            Self::GRAN_64K => Some(16),
+            _ => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_cd_size() {
+        assert_eq!(size_of::<Cd>(), CD_SIZE);
+    }
+
+    #[test]
+    fn test_cd_valid() {
+        let cd = new_cd();
+        assert!(!cd.valid());
+
+        let cd = Cd {
+            qw0: CdDw0::new().with_v(true),
+            ..new_cd()
+        };
+        assert!(cd.valid());
+    }
+
+    #[test]
+    fn test_cd_dw0_fields() {
+        let dw0 = CdDw0::new()
+            .with_t0sz(16)
+            .with_tg0(Tg0::GRAN_4K.0)
+            .with_ir0(0b01) // write-back (WB)
+            .with_or0(0b01)
+            .with_sh0(0b11) // inner shareable (ISH)
+            .with_v(true)
+            .with_ips(Ips::IPS_40.0)
+            .with_aa64(true)
+            .with_asid(42);
+
+        assert_eq!(dw0.t0sz(), 16);
+        assert_eq!(dw0.tg0(), Tg0::GRAN_4K.0);
+        assert_eq!(dw0.ir0(), 0b01);
+        assert_eq!(dw0.or0(), 0b01);
+        assert_eq!(dw0.sh0(), 0b11);
+        assert!(dw0.v());
+        assert_eq!(dw0.ips(), Ips::IPS_40.0);
+        assert!(dw0.aa64());
+        assert_eq!(dw0.asid(), 42);
+        assert!(!dw0.epd0());
+    }
+
+    fn new_cd() -> Cd {
+        Cd {
+            qw0: CdDw0::new(),
+            qw1: CdDw1::new(),
+            _qw2: 0,
+            mair0: 0,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        }
+    }
+
+    #[test]
+    fn test_cd_ttb0_address() {
+        let ttb0_addr: u64 = 0x4000_0000;
+        let cd = Cd {
+            qw1: CdDw1::new().with_ttb0(ttb0_addr >> 4),
+            ..new_cd()
+        };
+        assert_eq!(cd.ttb0(), ttb0_addr);
+    }
+
+    #[test]
+    fn test_cd_ttb0_large_address() {
+        let ttb0_addr: u64 = 0x00FF_FFFF_F000;
+        let cd = Cd {
+            qw1: CdDw1::new().with_ttb0(ttb0_addr >> 4),
+            ..new_cd()
+        };
+        assert_eq!(cd.ttb0(), ttb0_addr);
+    }
+
+    #[test]
+    fn test_cd_full_roundtrip() {
+        let ttb0_addr: u64 = 0x8000_0000;
+        let cd = Cd {
+            qw0: CdDw0::new()
+                .with_t0sz(32)
+                .with_tg0(Tg0::GRAN_4K.0)
+                .with_ir0(0b01)
+                .with_or0(0b01)
+                .with_sh0(0b11)
+                .with_v(true)
+                .with_ips(Ips::IPS_40.0)
+                .with_aa64(true)
+                .with_asid(100),
+            qw1: CdDw1::new().with_ttb0(ttb0_addr >> 4),
+            mair0: 0xFF44_0C04_00BB_FF00,
+            ..new_cd()
+        };
+
+        assert!(cd.valid());
+        assert_eq!(cd.t0sz(), 32);
+        assert_eq!(cd.tg0(), Tg0::GRAN_4K);
+        assert_eq!(cd.ips(), Ips::IPS_40);
+        assert!(cd.aa64());
+        assert_eq!(cd.asid(), 100);
+        assert_eq!(cd.ttb0(), ttb0_addr);
+        assert_eq!(cd.mair0, 0xFF44_0C04_00BB_FF00);
+    }
+
+    #[test]
+    fn test_tg0_granule_sizes() {
+        assert_eq!(Tg0::GRAN_4K.granule_size(), Some(4096));
+        assert_eq!(Tg0::GRAN_16K.granule_size(), Some(16384));
+        assert_eq!(Tg0::GRAN_64K.granule_size(), Some(65536));
+        assert_eq!(Tg0(0b11).granule_size(), None);
+    }
+
+    #[test]
+    fn test_tg0_bits_per_level() {
+        assert_eq!(Tg0::GRAN_4K.bits_per_level(), Some(9));
+        assert_eq!(Tg0::GRAN_16K.bits_per_level(), Some(11));
+        assert_eq!(Tg0::GRAN_64K.bits_per_level(), Some(13));
+    }
+
+    #[test]
+    fn test_tg0_page_shift() {
+        assert_eq!(Tg0::GRAN_4K.page_shift(), Some(12));
+        assert_eq!(Tg0::GRAN_16K.page_shift(), Some(14));
+        assert_eq!(Tg0::GRAN_64K.page_shift(), Some(16));
+    }
+
+    #[test]
+    fn test_ips_bits() {
+        assert_eq!(Ips::IPS_32.bits(), Some(32));
+        assert_eq!(Ips::IPS_36.bits(), Some(36));
+        assert_eq!(Ips::IPS_40.bits(), Some(40));
+        assert_eq!(Ips::IPS_42.bits(), Some(42));
+        assert_eq!(Ips::IPS_44.bits(), Some(44));
+        assert_eq!(Ips::IPS_48.bits(), Some(48));
+        assert_eq!(Ips::IPS_52.bits(), Some(52));
+        assert_eq!(Ips(0b111).bits(), None);
+    }
+
+    #[test]
+    fn test_cd_invalid() {
+        let cd = new_cd();
+        assert!(!cd.valid());
+    }
+
+    #[test]
+    fn test_cd_epd0_disables_walk() {
+        let cd = Cd {
+            qw0: CdDw0::new().with_v(true).with_epd0(true),
+            ..new_cd()
+        };
+
+        assert!(cd.valid());
+        assert!(cd.epd0());
+    }
+
+    #[test]
+    fn test_translation_context_from_cd() {
+        let cd = Cd {
+            qw0: CdDw0::new()
+                .with_t0sz(16) // 64 - 16 = 48-bit VA
+                .with_tg0(Tg0::GRAN_4K.0)
+                .with_ips(Ips::IPS_48.0)
+                .with_v(true)
+                .with_aa64(true),
+            ..new_cd()
+        };
+
+        let tg0 = cd.tg0();
+        let va_bits = 64 - cd.t0sz() as u32;
+        let page_shift = tg0.page_shift().unwrap() as u32;
+        let bits_per_level = tg0.bits_per_level().unwrap() as u32;
+
+        assert_eq!(va_bits, 48);
+        assert_eq!(page_shift, 12);
+        assert_eq!(bits_per_level, 9);
+
+        // 4K granule, 48-bit VA: ceil((48 - 12) / 9) = 4 levels, starting at level 0.
+        let total_bits = va_bits - page_shift;
+        let num_levels = total_bits.div_ceil(bits_per_level);
+        assert_eq!(num_levels, 4);
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/spec/commands.rs b/vm/devices/iommu/smmu/src/spec/commands.rs
new file mode 100644
index 0000000000..871c600fc7
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/spec/commands.rs
@@ -0,0 +1,299 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 command queue entry definitions.
+//!
+//! Command queue entries are 16 bytes (128 bits). The opcode is in bits `[7:0]`
+//! of the first dword.
+
+use bitfield_struct::bitfield;
+use open_enum::open_enum;
+use zerocopy::FromBytes;
+use zerocopy::Immutable;
+use zerocopy::IntoBytes;
+use zerocopy::KnownLayout;
+
+open_enum! {
+    /// Command queue opcodes (carried in bits `[7:0]` of a command's first quadword).
+    #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+    pub enum CmdOpcode: u8 {
+        /// Prefetch configuration.
+        PREFETCH_CFG = 0x01,
+        /// Invalidate cached STE.
+        CFGI_STE = 0x03,
+        /// Invalidate cached STE range (Range=31 means all STEs, i.e. CMD_CFGI_ALL).
+        CFGI_STE_RANGE = 0x04,
+        /// Invalidate cached context descriptor.
+        CFGI_CD = 0x05,
+        /// Invalidate all cached CDs for a stream.
+        CFGI_CD_ALL = 0x06,
+        /// Invalidate all non-Hyp TLB entries.
+        TLBI_NH_ALL = 0x10,
+        /// Invalidate non-Hyp TLB entries by ASID.
+        TLBI_NH_ASID = 0x11,
+        /// Invalidate non-Hyp TLB entry by VA.
+        TLBI_NH_VA = 0x12,
+        /// Invalidate non-Hyp TLB entry by VA (all ASIDs).
+        TLBI_NH_VAA = 0x13,
+        /// Invalidate all stage 1+2 TLB entries for a VMID.
+        TLBI_S12_VMALL = 0x28,
+        /// Invalidate all non-secure non-Hyp TLB entries.
+        TLBI_NSNH_ALL = 0x30,
+        /// Synchronization command.
+        CMD_SYNC = 0x46,
+    }
+}
+
+/// A raw 16-byte command queue entry, stored as two quadwords.
+///
+/// The opcode lives in the low byte of `qw0`; how the remaining bits are
+/// interpreted depends on that opcode.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct CmdEntry {
+    /// Quadword 0: the opcode plus command-specific fields.
+    pub qw0: u64,
+    /// Quadword 1: an address or other extended fields.
+    pub qw1: u64,
+}
+
+impl CmdEntry {
+    /// Decodes the opcode from the least significant byte (bits `[7:0]`) of `qw0`.
+    pub fn opcode(&self) -> CmdOpcode {
+        CmdOpcode(self.qw0.to_le_bytes()[0])
+    }
+}
+
+/// CMD_CFGI_STE (opcode 0x03): Invalidate cached STE (first quadword layout).
+#[bitfield(u64)]
+pub struct CmdCfgiSte {
+    /// Opcode (bits `[7:0]`).
+    #[bits(8)]
+    pub opcode: u8,
+    /// SSec, decoded here at bit 8. NOTE(review): Linux `CMDQ_0_SSEC` is bit 10 — confirm.
+    pub ssec: bool,
+    #[bits(23)]
+    _reserved0: u32,
+    /// StreamID (bits `[63:32]`).
+    #[bits(32)]
+    pub sid: u32,
+}
+
+/// CMD_CFGI_STE_RANGE (opcode 0x04): Invalidate cached STE range.
+///
+/// When Range=31, this is CMD_CFGI_ALL (invalidate all STEs).
+#[bitfield(u64)]
+pub struct CmdCfgiSteRange {
+    /// Opcode (bits `[7:0]`).
+    #[bits(8)]
+    pub opcode: u8,
+    /// SSec, decoded here at bit 8. NOTE(review): Linux `CMDQ_0_SSEC` is bit 10 — confirm.
+    pub ssec: bool,
+    #[bits(23)]
+    _reserved0: u32,
+    /// StreamID (bits `[63:32]`).
+    #[bits(32)]
+    pub sid: u32,
+}
+
+impl CmdCfgiSteRange {
+    /// Returns the 5-bit Range field: bits `[68:64]` of the 128-bit entry (qw1 bits `[4:0]`).
+    pub fn range_from_entry(entry: &CmdEntry) -> u8 {
+        (entry.qw1 & 0x1F) as u8
+    }
+
+    /// Range=31 means invalidate ALL STEs.
+    pub const RANGE_ALL: u8 = 31;
+}
+
+/// CMD_CFGI_CD (opcode 0x05): Invalidate cached context descriptor (first quadword layout).
+#[bitfield(u64)]
+pub struct CmdCfgiCd {
+    /// Opcode (bits `[7:0]`).
+    #[bits(8)]
+    pub opcode: u8,
+    /// SSec, decoded here at bit 8. NOTE(review): Linux `CMDQ_0_SSEC` is bit 10 — confirm.
+    pub ssec: bool,
+    #[bits(3)]
+    _reserved0: u32,
+    /// SubstreamID (bits `[31:12]`).
+    #[bits(20)]
+    pub ssid: u32,
+    /// StreamID (bits `[63:32]`).
+    #[bits(32)]
+    pub sid: u32,
+}
+
+/// CMD_TLBI_NH_ASID (opcode 0x11): Invalidate TLB by ASID (first quadword layout).
+#[bitfield(u64)]
+pub struct CmdTlbiNhAsid {
+    /// Opcode (bits `[7:0]`).
+    #[bits(8)]
+    pub opcode: u8,
+    #[bits(24)]
+    _reserved0: u32,
+    /// VMID (bits `[47:32]`).
+    #[bits(16)]
+    pub vmid: u16,
+    /// ASID (bits `[63:48]`).
+    #[bits(16)]
+    pub asid: u16,
+}
+
+/// CMD_TLBI_NH_VA (opcode 0x12): Invalidate TLB by virtual address.
+#[bitfield(u64)]
+pub struct CmdTlbiNhVa {
+    /// Opcode (bits `[7:0]`).
+    #[bits(8)]
+    pub opcode: u8,
+    #[bits(24)]
+    _reserved0: u32,
+    /// VMID (bits `[47:32]`).
+    #[bits(16)]
+    pub vmid: u16,
+    /// ASID (bits `[63:48]`).
+    #[bits(16)]
+    pub asid: u16,
+}
+
+impl CmdTlbiNhVa {
+    /// Extracts the VA: address bits `[63:12]` sit in place in qw1 bits
+    /// `[63:12]` (entry bits `[127:76]`); qw1 bits `[11:0]` are not address.
+    pub fn addr_from_entry(entry: &CmdEntry) -> u64 {
+        const VA_MASK: u64 = !0xFFF; // keep qw1 bits [63:12], drop Leaf/TTL/TG
+        entry.qw1 & VA_MASK
+    }
+
+    /// Leaf bit is at bit 64 of the 128-bit entry (bit 0 of qw1).
+    pub fn leaf_from_entry(entry: &CmdEntry) -> bool {
+        entry.qw1 & 1 != 0
+    }
+}
+
+/// CMD_SYNC (opcode 0x46): Synchronization command (first quadword layout).
+#[bitfield(u64)]
+pub struct CmdSync {
+    /// Opcode (bits `[7:0]`).
+    #[bits(8)]
+    pub opcode: u8,
+    #[bits(4)]
+    _reserved0: u32,
+    /// Completion signal type (bits `[13:12]`); see `SyncCs` for the encodings.
+    #[bits(2)]
+    pub cs: u8,
+    #[bits(8)]
+    _reserved1: u32,
+    /// MSI shareability (bits `[23:22]`).
+    #[bits(2)]
+    pub msh: u8,
+    /// MSI attributes (bits `[27:24]`).
+    #[bits(4)]
+    pub msi_attr: u8,
+    #[bits(4)]
+    _reserved2: u32,
+    /// MSI data (bits `[63:32]`).
+    #[bits(32)]
+    pub msi_data: u32,
+}
+
+impl CmdSync {
+    /// Returns the raw MSI address field: entry bits `[119:66]`, which hold
+    /// address bits `[55:2]`, right-aligned (the write address divided by 4).
+    pub fn msi_addr_from_entry(entry: &CmdEntry) -> u64 {
+        const FIELD_MASK: u64 = (1u64 << 54) - 1; // 54-bit field
+        (entry.qw1 >> 2) & FIELD_MASK
+    }
+
+    /// Returns the byte address the MSI is written to (bits `[1:0]` are zero).
+    pub fn msi_write_addr_from_entry(entry: &CmdEntry) -> u64 {
+        entry.qw1 & (((1u64 << 54) - 1) << 2)
+    }
+}
+
+open_enum! {
+    /// CMD_SYNC completion signal types (encodings of the 2-bit `cs` field of `CmdSync`).
+    pub enum SyncCs: u8 {
+        /// No signal.
+        SIG_NONE = 0b00,
+        /// Send MSI/IRQ.
+        SIG_IRQ = 0b01,
+        /// Send SEV wakeup event.
+        SIG_SEV = 0b10,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_cmd_opcodes() {
+        assert_eq!(CmdOpcode::PREFETCH_CFG.0, 0x01);
+        assert_eq!(CmdOpcode::CFGI_STE.0, 0x03);
+        assert_eq!(CmdOpcode::CFGI_STE_RANGE.0, 0x04);
+        assert_eq!(CmdOpcode::CFGI_CD.0, 0x05);
+        assert_eq!(CmdOpcode::CFGI_CD_ALL.0, 0x06);
+        assert_eq!(CmdOpcode::TLBI_NH_ALL.0, 0x10);
+        assert_eq!(CmdOpcode::TLBI_NH_ASID.0, 0x11);
+        assert_eq!(CmdOpcode::TLBI_NH_VA.0, 0x12);
+        assert_eq!(CmdOpcode::TLBI_NH_VAA.0, 0x13);
+        assert_eq!(CmdOpcode::TLBI_NSNH_ALL.0, 0x30);
+        assert_eq!(CmdOpcode::CMD_SYNC.0, 0x46);
+    }
+
+    #[test]
+    fn test_cmd_entry_opcode() {
+        let entry = CmdEntry { qw0: 0x46, qw1: 0 };
+        assert_eq!(entry.opcode(), CmdOpcode::CMD_SYNC);
+    }
+
+    #[test]
+    fn test_cmd_cfgi_ste_sid() {
+        let cmd = CmdCfgiSte::new()
+            .with_opcode(CmdOpcode::CFGI_STE.0)
+            .with_sid(42);
+        assert_eq!(cmd.opcode(), CmdOpcode::CFGI_STE.0);
+        assert_eq!(cmd.sid(), 42);
+    }
+
+    #[test]
+    fn test_cmd_sync_fields() {
+        let cmd = CmdSync::new()
+            .with_opcode(CmdOpcode::CMD_SYNC.0)
+            .with_cs(SyncCs::SIG_IRQ.0)
+            .with_msi_data(0xDEAD_BEEF);
+        assert_eq!(cmd.opcode(), CmdOpcode::CMD_SYNC.0);
+        assert_eq!(cmd.cs(), SyncCs::SIG_IRQ.0);
+        assert_eq!(cmd.msi_data(), 0xDEAD_BEEF);
+    }
+
+    #[test]
+    fn test_cmd_sync_msi_addr() {
+        // MSI address = 0x1234_5678 (already 4-byte aligned).
+        // qw1 carries address bits [55:2] in place, i.e. (addr >> 2) << 2.
+        let addr: u64 = 0x1234_5678;
+        let addr_shifted = addr >> 2;
+        let entry = CmdEntry {
+            qw0: CmdOpcode::CMD_SYNC.0 as u64,
+            qw1: addr_shifted << 2,
+        };
+        assert_eq!(CmdSync::msi_write_addr_from_entry(&entry), addr & !0x3);
+    }
+
+    #[test]
+    fn test_cfgi_ste_range_all() {
+        let entry = CmdEntry {
+            qw0: CmdOpcode::CFGI_STE_RANGE.0 as u64,
+            qw1: 31,
+        };
+        assert_eq!(
+            CmdCfgiSteRange::range_from_entry(&entry),
+            CmdCfgiSteRange::RANGE_ALL
+        );
+    }
+
+    #[test]
+    fn test_cmd_entry_size() {
+        assert_eq!(size_of::<CmdEntry>(), 16);
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/spec/events.rs b/vm/devices/iommu/smmu/src/spec/events.rs
new file mode 100644
index 0000000000..3c85f01167
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/spec/events.rs
@@ -0,0 +1,267 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 event queue entry definitions.
+//!
+//! Event queue entries are 32 bytes (256 bits). The event type is in bits `[7:0]`
+//! of the first dword.
+
+use bitfield_struct::bitfield;
+use open_enum::open_enum;
+use zerocopy::FromBytes;
+use zerocopy::Immutable;
+use zerocopy::IntoBytes;
+use zerocopy::KnownLayout;
+
+open_enum! {
+    /// Event queue record types.
+    #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+    pub enum EventId: u8 {
+        /// Unsupported upstream transaction.
+        F_UUT = 0x01,
+        /// StreamID out of range.
+        C_BAD_STREAMID = 0x02,
+        /// STE fetch external abort.
+        F_STE_FETCH = 0x03,
+        /// Bad STE configuration.
+        C_BAD_STE = 0x04,
+        /// Bad ATS translation request.
+        F_BAD_ATS_TREQ = 0x05,
+        /// Stream disabled.
+        F_STREAM_DISABLED = 0x06,
+        /// ATS translated traffic forbidden.
+        F_TRANSL_FORBIDDEN = 0x07,
+        /// Bad SubstreamID.
+        C_BAD_SUBSTREAMID = 0x08,
+        /// CD fetch external abort.
+        F_CD_FETCH = 0x09,
+        /// Bad CD configuration.
+        C_BAD_CD = 0x0A,
+        /// Translation table walk external abort.
+        F_WALK_EABT = 0x0B,
+        /// Translation fault.
+        F_TRANSLATION = 0x10,
+        /// Address size fault.
+        F_ADDR_SIZE = 0x11,
+        /// Access flag fault.
+        F_ACCESS = 0x12,
+        /// Permission fault.
+        F_PERMISSION = 0x13,
+        /// TLB conflict.
+        F_TLB_CONFLICT = 0x20,
+    }
+}
+
+/// Event queue entry (32 bytes).
+///
+/// Fields are declared in record order, least significant word first.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct EvtEntry {
+    /// Event type and SubstreamID info (record bits `[31:0]`).
+    pub header: EvtHeader,
+    /// StreamID of the faulting device (record bits `[63:32]`).
+    pub sid: u32,
+    /// Reserved / STAG (record bits `[95:64]`).
+    pub _stag: u32,
+    /// Fault flags: PnU, InD, RnW, S2, CLASS (record bits `[127:96]`).
+    pub flags: EvtFlags,
+    /// Faulting input address (64-bit, record bits `[191:128]`).
+    pub input_addr: u64,
+    /// Fetch address or reserved (64-bit, record bits `[255:192]`).
+    pub _fetch_addr: u64,
+}
+
+/// Event entry header (first 32 bits).
+#[bitfield(u32)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct EvtHeader {
+    /// Event type (bits `[7:0]`).
+    #[bits(8)]
+    pub event_id: u8,
+    #[bits(3)]
+    _reserved0: u32,
+    /// SubstreamID valid (bit 11).
+    pub ssv: bool,
+    /// SubstreamID (bits `[31:12]`).
+    #[bits(20)]
+    pub ssid: u32,
+}
+
+/// Event entry flags (fourth 32-bit word, record bits `[127:96]`).
+#[bitfield(u32)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct EvtFlags {
+    #[bits(1)]
+    _reserved0: u32,
+    /// Privileged/Unprivileged (bit 1 of this word).
+    pub pnu: bool,
+    /// Instruction/Data (bit 2 of this word).
+    pub ind: bool,
+    /// Read (true) / Write (false) (bit 3 of this word).
+    pub rnw: bool,
+    #[bits(3)]
+    _reserved1: u32,
+    /// Stage 2 fault (false = S1 fault) (bit 7 of this word).
+    pub s2: bool,
+    /// Fault class (bits `[9:8]` of this word).
+    #[bits(2)]
+    pub class: u8,
+    #[bits(22)]
+    _reserved2: u32,
+}
+
+impl EvtEntry {
+    /// Size of an event queue entry in bytes.
+    pub const SIZE: usize = 32;
+
+    /// Creates a new event entry with every field zeroed.
+    pub fn new() -> Self {
+        Self {
+            header: EvtHeader::new(),
+            sid: 0,
+            flags: EvtFlags::new(),
+            _stag: 0,
+            input_addr: 0,
+            _fetch_addr: 0,
+        }
+    }
+
+    /// Returns the event type from the header, wrapped as an `EventId`.
+    pub fn event_id(&self) -> EventId {
+        EventId(self.header.event_id())
+    }
+
+    /// Creates an `F_TRANSLATION` event for `sid`/`iova`; RnW is set to `!write`.
+    pub fn translation_fault(sid: u32, iova: u64, write: bool) -> Self {
+        Self {
+            header: EvtHeader::new().with_event_id(EventId::F_TRANSLATION.0),
+            sid,
+            flags: EvtFlags::new().with_rnw(!write),
+            input_addr: iova,
+            ..Self::new()
+        }
+    }
+
+    /// Creates an `F_PERMISSION` event for `sid`/`iova`; RnW is set to `!write`.
+    pub fn permission_fault(sid: u32, iova: u64, write: bool) -> Self {
+        Self {
+            header: EvtHeader::new().with_event_id(EventId::F_PERMISSION.0),
+            sid,
+            flags: EvtFlags::new().with_rnw(!write),
+            input_addr: iova,
+            ..Self::new()
+        }
+    }
+
+    /// Creates an `F_ACCESS` (access flag fault) event for `sid`/`iova`; RnW is set to `!write`.
+    pub fn access_fault(sid: u32, iova: u64, write: bool) -> Self {
+        Self {
+            header: EvtHeader::new().with_event_id(EventId::F_ACCESS.0),
+            sid,
+            flags: EvtFlags::new().with_rnw(!write),
+            input_addr: iova,
+            ..Self::new()
+        }
+    }
+
+    /// Creates an `F_ADDR_SIZE` event for `sid`/`iova`; RnW is set to `!write`.
+    pub fn addr_size_fault(sid: u32, iova: u64, write: bool) -> Self {
+        Self {
+            header: EvtHeader::new().with_event_id(EventId::F_ADDR_SIZE.0),
+            sid,
+            flags: EvtFlags::new().with_rnw(!write),
+            input_addr: iova,
+            ..Self::new()
+        }
+    }
+
+    /// Creates a `C_BAD_STE` event for `sid`; all other fields stay zero.
+    pub fn bad_ste(sid: u32) -> Self {
+        Self {
+            header: EvtHeader::new().with_event_id(EventId::C_BAD_STE.0),
+            sid,
+            ..Self::new()
+        }
+    }
+
+    /// Creates a `C_BAD_CD` event for `sid`; all other fields stay zero.
+    pub fn bad_cd(sid: u32) -> Self {
+        Self {
+            header: EvtHeader::new().with_event_id(EventId::C_BAD_CD.0),
+            sid,
+            ..Self::new()
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_event_ids() {
+        assert_eq!(EventId::F_UUT.0, 0x01);
+        assert_eq!(EventId::C_BAD_STREAMID.0, 0x02);
+        assert_eq!(EventId::C_BAD_STE.0, 0x04);
+        assert_eq!(EventId::C_BAD_CD.0, 0x0A);
+        assert_eq!(EventId::F_TRANSLATION.0, 0x10);
+        assert_eq!(EventId::F_ADDR_SIZE.0, 0x11);
+        assert_eq!(EventId::F_ACCESS.0, 0x12);
+        assert_eq!(EventId::F_PERMISSION.0, 0x13);
+    }
+
+    #[test]
+    fn test_evt_entry_size() {
+        assert_eq!(size_of::<EvtEntry>(), 32);
+    }
+
+    #[test]
+    fn test_evt_entry_translation_fault() {
+        let evt = EvtEntry::translation_fault(0x42, 0x1000_2000, true);
+        assert_eq!(evt.event_id(), EventId::F_TRANSLATION);
+        assert_eq!(evt.sid, 0x42);
+        assert_eq!(evt.input_addr, 0x1000_2000);
+        // write = true → the constructor stores RnW = !write = false.
+        assert!(!evt.flags.rnw());
+    }
+
+    #[test]
+    fn test_evt_entry_permission_fault() {
+        let evt = EvtEntry::permission_fault(0x10, 0xFFFF_0000, false);
+        assert_eq!(evt.event_id(), EventId::F_PERMISSION);
+        assert_eq!(evt.sid, 0x10);
+        assert_eq!(evt.input_addr, 0xFFFF_0000);
+        // write = false (a read) → RnW = true.
+        assert!(evt.flags.rnw());
+    }
+
+    #[test]
+    fn test_evt_entry_bad_ste() {
+        let evt = EvtEntry::bad_ste(0x100);
+        assert_eq!(evt.event_id(), EventId::C_BAD_STE);
+        assert_eq!(evt.sid, 0x100);
+    }
+
+    #[test]
+    fn test_evt_entry_access_fault() {
+        let evt = EvtEntry::access_fault(5, 0xDEAD_BEEF_0000, true);
+        assert_eq!(evt.event_id(), EventId::F_ACCESS);
+        assert_eq!(evt.sid, 5);
+        assert_eq!(evt.input_addr, 0xDEAD_BEEF_0000);
+    }
+
+    #[test]
+    fn test_evt_entry_roundtrip() {
+        let evt = EvtEntry {
+            header: EvtHeader::new().with_event_id(EventId::F_ADDR_SIZE.0),
+            sid: 0xABCD,
+            flags: EvtFlags::new().with_rnw(true),
+            input_addr: 0x1234_5678_9ABC_DEF0,
+            ..EvtEntry::new()
+        };
+
+        assert_eq!(evt.event_id(), EventId::F_ADDR_SIZE);
+        assert_eq!(evt.sid, 0xABCD);
+        assert_eq!(evt.input_addr, 0x1234_5678_9ABC_DEF0);
+        assert!(evt.flags.rnw());
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/spec/mod.rs b/vm/devices/iommu/smmu/src/spec/mod.rs
new file mode 100644
index 0000000000..5cadef514f
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/spec/mod.rs
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 spec-derived type definitions.
+//!
+//! Register layouts, stream table entries, context descriptors, command/event
+//! queue entries, and page table descriptors — all derived from the Arm SMMUv3
+//! architecture specification (IHI 0070).
+//!
+//! This module contains only type definitions, not algorithms.
+
+pub mod cd;
+pub mod commands;
+pub mod events;
+pub mod pt;
+pub mod registers;
+pub mod ste;
diff --git a/vm/devices/iommu/smmu/src/spec/pt.rs b/vm/devices/iommu/smmu/src/spec/pt.rs
new file mode 100644
index 0000000000..947bfc60ab
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/spec/pt.rs
@@ -0,0 +1,396 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! AArch64 VMSAv8 stage 1 page table descriptor definitions.
+//!
+//! The SMMU uses the same page table format as AArch64 PE stage 1 translation.
+//! These are the standard ARMv8 translation table descriptors defined in the
+//! Arm Architecture Reference Manual (DDI 0487).
+
+use bitfield_struct::bitfield;
+use open_enum::open_enum;
+use zerocopy::FromBytes;
+use zerocopy::Immutable;
+use zerocopy::IntoBytes;
+use zerocopy::KnownLayout;
+
+/// A 64-bit page table descriptor.
+///
+/// The interpretation depends on the level and the Type bit:
+/// - Level 0-2, Type=1: Table descriptor (points to next-level table)
+/// - Level 1-2, Type=0: Block descriptor (maps a large region)
+/// - Level 3, Type=1: Page descriptor (maps a single page)
+/// - Level 3, Type=0: Reserved (invalid)
+/// - Valid=0: Invalid/fault entry
+#[bitfield(u64)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct PtDesc {
+    /// Valid bit `[0]`. 0 = fault entry.
+    pub valid: bool,
+    /// Descriptor type `[1]`. 1 = table/page, 0 = block (or reserved at L3).
+    pub desc_type: bool,
+    /// Memory attribute index `[4:2]` (indexes into MAIR).
+    #[bits(3)]
+    pub attr_index: u8,
+    /// Non-secure bit `[5]`.
+    pub ns: bool,
+    /// Access permissions `[7:6]`; see `ApBits` for the encodings.
+    #[bits(2)]
+    pub ap: u8,
+    /// Shareability `[9:8]`; see `Shareability` for the encodings.
+    #[bits(2)]
+    pub sh: u8,
+    /// Access flag `[10]`. Must be 1 to avoid AF faults (when HTTU not supported).
+    pub af: bool,
+    /// Not-global `[11]` (if 1, uses ASID for TLB matching).
+    pub ng: bool,
+    /// Output address / next-level table address bits `[47:12]`.
+    /// For 4KB granule: block at L1 uses `[47:30]`, block at L2 uses `[47:21]`,
+    /// page at L3 uses `[47:12]`.
+    #[bits(36)]
+    pub addr_bits: u64,
+    /// Reserved / upper attributes bits `[49:48]`.
+    #[bits(2)]
+    _reserved_upper: u64,
+    /// Guarded page `[50]`.
+    pub gp: bool,
+    /// Dirty bit modifier `[51]`.
+    pub dbm: bool,
+    /// Contiguous hint `[52]`.
+    pub contiguous: bool,
+    /// Privileged execute-never `[53]`.
+    pub pxn: bool,
+    /// Unprivileged execute-never `[54]` (or XN for EL2/EL3).
+    pub uxn: bool,
+    /// Bits `[58:55]`, reserved for software use.
+    #[bits(4)]
+    pub sw_use: u8,
+    /// Bits `[63:59]`: PBHA / ignored in block and page descriptors.
+    #[bits(5)]
+    pub ignored_upper: u8,
+}
+
+impl PtDesc {
+    /// Returns true if this is a valid entry.
+    pub fn is_valid(&self) -> bool {
+        self.valid()
+    }
+
+    /// Returns true if this is a table descriptor (levels 0-2) or page
+    /// descriptor (level 3). Type bit = 1.
+    pub fn is_table(&self) -> bool {
+        self.valid() && self.desc_type()
+    }
+
+    /// Returns true if this is a block descriptor (levels 1-2).
+    /// Valid=1 and Type=0.
+    pub fn is_block(&self) -> bool {
+        self.valid() && !self.desc_type()
+    }
+
+    /// Returns true if this is a page descriptor at level 3.
+    /// At L3, Valid=1 and Type=1 means page. Type=0 is reserved/fault.
+    pub fn is_page_at_l3(&self) -> bool {
+        self.valid() && self.desc_type()
+    }
+
+    /// Returns the output address for a 4KB granule.
+    ///
+    /// For table descriptors: the next-level table address (bits `[47:12]`).
+    /// For block descriptors at L1: bits `[47:30]` (1GB block).
+    /// For block descriptors at L2: bits `[47:21]` (2MB block).
+    /// For page descriptors at L3: bits `[47:12]` (4KB page).
+    pub fn output_address_4k(&self, level: u8) -> u64 {
+        let raw = self.addr_bits() << 12;
+        match level {
+            0 => raw, // table only at L0 for 4K
+            1 => {
+                if self.is_block() {
+                    raw & !((1u64 << 30) - 1) // 1GB aligned
+                } else {
+                    raw // table address
+                }
+            }
+            2 => {
+                if self.is_block() {
+                    raw & !((1u64 << 21) - 1) // 2MB aligned
+                } else {
+                    raw // table address
+                }
+            }
+            3 => raw, // page address, 4KB aligned
+            _ => raw,
+        }
+    }
+
+    /// Returns the output address for a 16KB granule.
+    ///
+    /// Descriptor bits `[13:12]` lie below the granule size, so they are
+    /// masked off: table and page addresses are always 16KB aligned. This
+    /// keeps stray low bits in a malformed descriptor from producing a
+    /// misaligned base address.
+    ///
+    /// Block descriptors are additionally aligned to the block size of
+    /// `level`.
+    pub fn output_address_16k(&self, level: u8) -> u64 {
+        // Table and page addresses occupy bits [47:14] with this granule.
+        let raw = (self.addr_bits() << 12) & !((1u64 << 14) - 1);
+        let block_shift = match level {
+            1 => 36, // L1 block: 64GB (bits [47:36])
+            2 => 25, // L2 block: 32MB (bits [47:25])
+            // No block size is defined here for the other levels.
+            _ => return raw,
+        };
+        if self.is_block() {
+            raw & !((1u64 << block_shift) - 1)
+        } else {
+            raw // table address
+        }
+    }
+
+    /// Returns the output address for a 64KB granule.
+    ///
+    /// Descriptor bits `[15:12]` lie below the granule size, so they are
+    /// masked off: table and page addresses are always 64KB aligned. A
+    /// level 2 block descriptor is additionally aligned to 512MB.
+    pub fn output_address_64k(&self, level: u8) -> u64 {
+        // Table and page addresses occupy bits [47:16] with this granule.
+        let raw = (self.addr_bits() << 12) & !((1u64 << 16) - 1);
+        if level == 2 && self.is_block() {
+            // L2 block: 512MB (bits [47:29])
+            raw & !((1u64 << 29) - 1)
+        } else {
+            // Table address, or page address at level 3.
+            raw
+        }
+    }
+
+    /// Returns the next-level table address (for table descriptors).
+    /// Taken from bits `[47:12]` as-is; no granule-specific masking is applied.
+    pub fn next_table_addr(&self) -> u64 {
+        self.addr_bits() << 12
+    }
+}
+
+open_enum! {
+    /// Access permission bits (AP`[2:1]`), the encoding held in `PtDesc::ap`.
+    pub enum ApBits: u8 {
+        /// EL1 R/W, EL0 no access.
+        RW_EL1 = 0b00,
+        /// EL1 R/W, EL0 R/W.
+        RW_ANY = 0b01,
+        /// EL1 R/O, EL0 no access.
+        RO_EL1 = 0b10,
+        /// EL1 R/O, EL0 R/O.
+        RO_ANY = 0b11,
+    }
+}
+
+impl ApBits {
+    /// Reports whether this encoding grants write access.
+    ///
+    /// Only the two read/write encodings return `true`; the read-only
+    /// encodings and any value outside the defined set return `false`.
+    pub fn allows_write(self) -> bool {
+        matches!(self, Self::RW_EL1 | Self::RW_ANY)
+    }
+
+    /// Reports whether this encoding grants read access.
+    ///
+    /// Unconditionally `true`: no AP encoding removes read permission.
+    pub fn allows_read(self) -> bool {
+        true
+    }
+}
+
+open_enum! {
+    /// Shareability field values (`PtDesc::sh`); encoding `0b01` is not named here.
+    pub enum Shareability: u8 {
+        /// Non-shareable.
+        NON_SHAREABLE = 0b00,
+        /// Outer shareable.
+        OUTER_SHAREABLE = 0b10,
+        /// Inner shareable.
+        INNER_SHAREABLE = 0b11,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pt_desc_invalid() {
+        let desc = PtDesc::from(0u64);
+        assert!(!desc.is_valid());
+        assert!(!desc.is_table());
+        assert!(!desc.is_block());
+    }
+
+    #[test]
+    fn test_pt_desc_table() {
+        // Valid=1, Type=1 → table descriptor
+        let desc = PtDesc::new().with_valid(true).with_desc_type(true);
+        assert!(desc.is_valid());
+        assert!(desc.is_table());
+        assert!(!desc.is_block());
+    }
+
+    #[test]
+    fn test_pt_desc_block() {
+        // Valid=1, Type=0 → block descriptor
+        let desc = PtDesc::new().with_valid(true).with_desc_type(false);
+        assert!(desc.is_valid());
+        assert!(!desc.is_table());
+        assert!(desc.is_block());
+    }
+
+    #[test]
+    fn test_pt_desc_page_at_l3() {
+        // At L3: Valid=1, Type=1 → page descriptor
+        let desc = PtDesc::new().with_valid(true).with_desc_type(true);
+        assert!(desc.is_page_at_l3());
+    }
+
+    #[test]
+    fn test_pt_desc_4k_page_address() {
+        // 4K page at L3: output address at bits [47:12]
+        let page_addr: u64 = 0x4000_1000;
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_addr_bits(page_addr >> 12);
+
+        assert_eq!(desc.output_address_4k(3), page_addr);
+    }
+
+    #[test]
+    fn test_pt_desc_4k_l2_block_address() {
+        // 2MB block at L2: output address at bits [47:21]
+        let block_addr: u64 = 0x4020_0000; // 2MB aligned
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(false)
+            .with_addr_bits(block_addr >> 12);
+
+        assert_eq!(desc.output_address_4k(2), block_addr);
+    }
+
+    #[test]
+    fn test_pt_desc_4k_l1_block_address() {
+        // 1GB block at L1: output address at bits [47:30]
+        let block_addr: u64 = 0x4000_0000; // 1GB aligned
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(false)
+            .with_addr_bits(block_addr >> 12);
+
+        assert_eq!(desc.output_address_4k(1), block_addr);
+    }
+
+    #[test]
+    fn test_pt_desc_table_next_addr() {
+        let table_addr: u64 = 0x8000_5000;
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_addr_bits(table_addr >> 12);
+
+        assert_eq!(desc.next_table_addr(), table_addr);
+    }
+
+    #[test]
+    fn test_pt_desc_access_flag() {
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_af(true);
+        assert!(desc.af());
+
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_af(false);
+        assert!(!desc.af());
+    }
+
+    #[test]
+    fn test_pt_desc_permissions() {
+        // RW_EL1
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_ap(ApBits::RW_EL1.0);
+        assert_eq!(desc.ap(), ApBits::RW_EL1.0);
+
+        // RO_EL1
+        let desc = desc.with_ap(ApBits::RO_EL1.0);
+        assert_eq!(desc.ap(), ApBits::RO_EL1.0);
+    }
+
+    #[test]
+    fn test_ap_bits_write_permission() {
+        assert!(ApBits::RW_EL1.allows_write());
+        assert!(ApBits::RW_ANY.allows_write());
+        assert!(!ApBits::RO_EL1.allows_write());
+        assert!(!ApBits::RO_ANY.allows_write());
+    }
+
+    #[test]
+    fn test_ap_bits_read_permission() {
+        // All valid AP values allow reads
+        assert!(ApBits::RW_EL1.allows_read());
+        assert!(ApBits::RW_ANY.allows_read());
+        assert!(ApBits::RO_EL1.allows_read());
+        assert!(ApBits::RO_ANY.allows_read());
+    }
+
+    #[test]
+    fn test_pt_desc_full_roundtrip() {
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_attr_index(3)
+            .with_ns(true)
+            .with_ap(ApBits::RO_ANY.0)
+            .with_sh(Shareability::INNER_SHAREABLE.0)
+            .with_af(true)
+            .with_ng(true)
+            .with_addr_bits(0x1234_5000_u64 >> 12)
+            .with_pxn(true)
+            .with_uxn(true);
+
+        assert!(desc.valid());
+        assert!(desc.desc_type());
+        assert_eq!(desc.attr_index(), 3);
+        assert!(desc.ns());
+        assert_eq!(desc.ap(), ApBits::RO_ANY.0);
+        assert_eq!(desc.sh(), Shareability::INNER_SHAREABLE.0);
+        assert!(desc.af());
+        assert!(desc.ng());
+        assert_eq!(desc.next_table_addr(), 0x1234_5000);
+        assert!(desc.pxn());
+        assert!(desc.uxn());
+    }
+
+    #[test]
+    fn test_pt_desc_preserves_page_offset() {
+        // Verify that the output address does not include sub-page bits
+        let page_addr: u64 = 0x8000_3000;
+        let desc = PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_addr_bits(page_addr >> 12);
+
+        // At L3, the output is the page base
+        assert_eq!(desc.output_address_4k(3), page_addr);
+        assert_eq!(desc.output_address_4k(3) & 0xFFF, 0);
+    }
+
+    #[test]
+    fn test_shareability_values() {
+        assert_eq!(Shareability::NON_SHAREABLE.0, 0b00);
+        assert_eq!(Shareability::OUTER_SHAREABLE.0, 0b10);
+        assert_eq!(Shareability::INNER_SHAREABLE.0, 0b11);
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/spec/registers.rs b/vm/devices/iommu/smmu/src/spec/registers.rs
new file mode 100644
index 0000000000..42b2f90e1e
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/spec/registers.rs
@@ -0,0 +1,708 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 MMIO register definitions.
+//!
+//! Register offsets and bitfield types from the Arm SMMUv3 architecture
+//! specification (IHI 0070), Chapter 6.
+
+use bitfield_struct::bitfield;
+use inspect::Inspect;
+use open_enum::open_enum;
+
+// =============================================================================
+// MMIO Register Offsets — Page 0 (base + 0x00000)
+// =============================================================================
+
+/// SMMU_IDR0: Feature identification register.
+pub const IDR0: u16 = 0x0000;
+/// SMMU_IDR1: Queue and stream size identification.
+pub const IDR1: u16 = 0x0004;
+/// SMMU_IDR2: Extended feature identification.
+pub const IDR2: u16 = 0x0008;
+/// SMMU_IDR3: Extended feature identification.
+pub const IDR3: u16 = 0x000C;
+/// SMMU_IDR4: Implementation-defined identification.
+pub const IDR4: u16 = 0x0010;
+/// SMMU_IDR5: Granule and output address size.
+pub const IDR5: u16 = 0x0014;
+/// SMMU_IIDR: Implementer identification.
+pub const IIDR: u16 = 0x0018;
+/// SMMU_AIDR: Architecture version identification.
+pub const AIDR: u16 = 0x001C;
+
+/// SMMU_CR0: Control register.
+pub const CR0: u16 = 0x0020;
+/// SMMU_CR0ACK: CR0 acknowledgment (read-only).
+pub const CR0ACK: u16 = 0x0024;
+/// SMMU_CR1: Queue/table access attributes.
+pub const CR1: u16 = 0x0028;
+/// SMMU_CR2: Extended controls.
+pub const CR2: u16 = 0x002C;
+
+/// SMMU_STATUSR: Status register.
+pub const STATUSR: u16 = 0x0040;
+/// SMMU_GBPA: Global bypass attributes.
+pub const GBPA: u16 = 0x0044;
+/// SMMU_AGBPA: Alternate global bypass attributes.
+pub const AGBPA: u16 = 0x0048;
+
+/// SMMU_IRQ_CTRL: Interrupt enable register.
+pub const IRQ_CTRL: u16 = 0x0050;
+/// SMMU_IRQ_CTRLACK: IRQ_CTRL acknowledgment (read-only).
+pub const IRQ_CTRLACK: u16 = 0x0054;
+
+/// SMMU_GERROR: Global error status (read-only, toggle protocol).
+pub const GERROR: u16 = 0x0060;
+/// SMMU_GERRORN: Global error acknowledgment.
+pub const GERRORN: u16 = 0x0064;
+
+/// SMMU_GERROR_IRQ_CFG0: GERROR MSI address (64-bit).
+pub const GERROR_IRQ_CFG0: u16 = 0x0068;
+/// SMMU_GERROR_IRQ_CFG1: GERROR MSI data payload.
+pub const GERROR_IRQ_CFG1: u16 = 0x0070;
+/// SMMU_GERROR_IRQ_CFG2: GERROR MSI attributes.
+pub const GERROR_IRQ_CFG2: u16 = 0x0074;
+
+/// SMMU_STRTAB_BASE: Stream table base address (64-bit).
+pub const STRTAB_BASE: u16 = 0x0080;
+/// SMMU_STRTAB_BASE_CFG: Stream table configuration.
+pub const STRTAB_BASE_CFG: u16 = 0x0088;
+
+/// SMMU_CMDQ_BASE: Command queue base address (64-bit).
+pub const CMDQ_BASE: u16 = 0x0090;
+/// SMMU_CMDQ_PROD: Command queue producer index.
+pub const CMDQ_PROD: u16 = 0x0098;
+/// SMMU_CMDQ_CONS: Command queue consumer index.
+pub const CMDQ_CONS: u16 = 0x009C;
+
+/// SMMU_EVENTQ_BASE: Event queue base address (64-bit).
+pub const EVENTQ_BASE: u16 = 0x00A0;
+
+/// SMMU_EVENTQ_IRQ_CFG0: Event queue MSI address (64-bit).
+pub const EVENTQ_IRQ_CFG0: u16 = 0x00B0;
+/// SMMU_EVENTQ_IRQ_CFG1: Event queue MSI data.
+pub const EVENTQ_IRQ_CFG1: u16 = 0x00B8;
+/// SMMU_EVENTQ_IRQ_CFG2: Event queue MSI attributes.
+pub const EVENTQ_IRQ_CFG2: u16 = 0x00BC;
+
+// =============================================================================
+// MMIO Register Offsets — Page 1 (base + 0x10000)
+// =============================================================================
+
+/// SMMU_EVENTQ_PROD: Event queue producer index (page 1).
+pub const EVENTQ_PROD_PAGE1: u32 = 0x100A8;
+/// SMMU_EVENTQ_CONS: Event queue consumer index (page 1).
+pub const EVENTQ_CONS_PAGE1: u32 = 0x100AC;
+
+/// SMMU_CMDQ_IRQ_CFG0: Command queue MSI address (page 1, 64-bit).
+pub const CMDQ_IRQ_CFG0_PAGE1: u32 = 0x10008;
+/// SMMU_CMDQ_IRQ_CFG1: Command queue MSI data (page 1).
+pub const CMDQ_IRQ_CFG1_PAGE1: u32 = 0x10010;
+/// SMMU_CMDQ_IRQ_CFG2: Command queue MSI attributes (page 1).
+pub const CMDQ_IRQ_CFG2_PAGE1: u32 = 0x10014;
+
+/// Total MMIO region size: page 0 (64KB) + page 1 (64KB) = 128KB.
+pub const MMIO_REGION_SIZE: u64 = 0x20000;
+
+// =============================================================================
+// Bitfield Types — Identification Registers
+// =============================================================================
+
+/// SMMU_IDR0: Feature identification.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Idr0 {
+    /// Stage 2 translation supported, bit `[0]`.
+    pub s2p: bool,
+    /// Stage 1 translation supported, bit `[1]`.
+    pub s1p: bool,
+    /// Translation table format, bits `[3:2]`.
+    #[bits(2)]
+    pub ttf: u8,
+    /// Coherent access supported, bit `[4]`.
+    pub cohacc: bool,
+    /// Broadcast TLB maintenance, bit `[5]`.
+    pub btm: bool,
+    /// Hardware translation table update, bits `[7:6]`.
+    #[bits(2)]
+    pub httu: u8,
+    /// Dormant hint, bit `[8]`.
+    pub dormhint: bool,
+    /// Hypervisor stage, bit `[9]`.
+    pub hyp: bool,
+    /// ATS supported, bit `[10]`.
+    pub ats: bool,
+    /// NS1ATS, bit `[11]`.
+    pub ns1ats: bool,
+    /// 16-bit ASID supported, bit `[12]`.
+    pub asid16: bool,
+    /// MSI supported, bit `[13]`.
+    pub msi: bool,
+    /// SEV supported, bit `[14]`.
+    pub sev: bool,
+    /// ATOS supported, bit `[15]`.
+    pub atos: bool,
+    /// PRI supported, bit `[16]`.
+    pub pri: bool,
+    /// VMID wildcard, bit `[17]`.
+    pub vmw: bool,
+    /// 16-bit VMID supported, bit `[18]`.
+    pub vmid16: bool,
+    /// 2-level CD table supported, bit `[19]`.
+    pub cd2l: bool,
+    /// Virtual ATOS, bit `[20]`.
+    pub vatos: bool,
+    /// Translation table endianness, bits `[22:21]`.
+    #[bits(2)]
+    pub ttendian: u8,
+    /// ATS recording error, bit `[23]`.
+    pub atsrecerr: bool,
+    /// Stall model, bits `[25:24]`.
+    #[bits(2)]
+    pub stall_model: u8,
+    /// Terminate model, bit `[26]`.
+    pub term_model: bool,
+    /// Stream table level, bits `[28:27]`.
+    #[bits(2)]
+    pub st_level: u8,
+    #[bits(1)]
+    _reserved: u32,
+    /// RME implementation, bit `[30]`.
+    pub rme_impl: bool,
+    #[bits(1)]
+    _reserved2: u32,
+}
+
+/// SMMU_IDR1: Queue and stream size identification.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Idr1 {
+    /// StreamID size (number of bits), bits `[5:0]`.
+    #[bits(6)]
+    pub sidsize: u8,
+    /// SubstreamID size (number of bits), bits `[10:6]`.
+    #[bits(5)]
+    pub ssidsize: u8,
+    /// Bits `[15:11]`, not modeled. NOTE(review): likely PRIQS per the spec; confirm.
+    #[bits(5)]
+    _reserved0: u32,
+    /// Max event queue size as log2(entries), bits `[20:16]`.
+    #[bits(5)]
+    pub eventqs: u8,
+    /// Max command queue size as log2(entries), bits `[25:21]`.
+    #[bits(5)]
+    pub cmdqs: u8,
+    /// Attribute permissions override, bit `[26]`.
+    pub attr_perms_ovr: bool,
+    /// Attribute types override, bit `[27]`.
+    pub attr_types_ovr: bool,
+    /// REL (relative base pointers), bit `[28]`.
+    pub rel: bool,
+    /// Queues preset, bit `[29]`.
+    pub queues_preset: bool,
+    /// Tables preset, bit `[30]`.
+    pub tables_preset: bool,
+    /// Enhanced CMDQ, bit `[31]`.
+    pub ecmdq: bool,
+}
+
+/// SMMU_IDR5: Granule and output address size.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Idr5 {
+    /// Output address size, bits `[2:0]`.
+    #[bits(3)]
+    pub oas: u8,
+    #[bits(1)]
+    _reserved0: u32,
+    /// 4KB granule supported, bit `[4]`.
+    pub gran4k: bool,
+    /// 16KB granule supported, bit `[5]`.
+    pub gran16k: bool,
+    /// 64KB granule supported, bit `[6]`.
+    pub gran64k: bool,
+    /// DS, bit `[7]`. NOTE(review): spec appears to define this as 52-bit address support; confirm.
+    pub ds: bool,
+    /// 128-bit descriptors, bit `[8]`.
+    pub d128: bool,
+    #[bits(1)]
+    _reserved1: u32,
+    /// VA extension (48 or 52 bit), bits `[11:10]`.
+    #[bits(2)]
+    pub vax: u8,
+    #[bits(4)]
+    _reserved2: u32,
+    /// Max stall entries, bits `[31:16]`.
+    #[bits(16)]
+    pub stall_max: u16,
+}
+
+// =============================================================================
+// Bitfield Types — Control Registers
+// =============================================================================
+
+/// SMMU_CR0: Control register.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Cr0 {
+    /// SMMU enable, bit `[0]`.
+    pub smmuen: bool,
+    /// PRI queue enable, bit `[1]`.
+    pub priqen: bool,
+    /// Event queue enable, bit `[2]`.
+    pub eventqen: bool,
+    /// Command queue enable, bit `[3]`.
+    pub cmdqen: bool,
+    /// ATS check enable, bit `[4]`.
+    pub atschk: bool,
+    #[bits(1)]
+    _reserved0: u32,
+    /// VMW override, bits `[8:6]`.
+    #[bits(3)]
+    pub vmw: u8,
+    #[bits(1)]
+    _reserved1: u32,
+    /// DPT walk enable, bit `[10]`.
+    pub dpt_walk_en: bool,
+    /// VSID enable, bit `[11]`. NOTE(review): confirm this bit position against the spec.
+    pub vsiden: bool,
+    #[bits(20)]
+    _reserved2: u32,
+}
+
+/// SMMU_CR1: Queue/table access attributes.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Cr1 {
+    /// Queue inner cacheability, bits `[1:0]`.
+    #[bits(2)]
+    pub queue_ic: u8,
+    /// Queue outer cacheability, bits `[3:2]`.
+    #[bits(2)]
+    pub queue_oc: u8,
+    /// Queue shareability, bits `[5:4]`.
+    #[bits(2)]
+    pub queue_sh: u8,
+    /// Table inner cacheability, bits `[7:6]`.
+    #[bits(2)]
+    pub table_ic: u8,
+    /// Table outer cacheability, bits `[9:8]`.
+    #[bits(2)]
+    pub table_oc: u8,
+    /// Table shareability, bits `[11:10]`.
+    #[bits(2)]
+    pub table_sh: u8,
+    #[bits(20)]
+    _reserved: u32,
+}
+
+/// SMMU_CR2: Extended controls.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Cr2 {
+    /// E2H, bit `[0]`: EL2-E2H translation regime enable.
+    pub e2h: bool,
+    /// RECINVSID, bit `[1]`: record C_BAD_STREAMID for invalid input StreamIDs.
+    pub recinvsid: bool,
+    /// PTM, bit `[2]`: private TLB maintenance.
+    pub ptm: bool,
+    #[bits(29)]
+    _reserved: u32,
+}
+
+/// SMMU_GBPA: Global bypass attributes.
+///
+/// Bits `[7:0]` (MemAttr, MTCFG) are not modeled and are kept reserved.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Gbpa {
+    #[bits(8)]
+    _reserved0: u32,
+    /// Allocation hint configuration, bits `[11:8]`.
+    #[bits(4)]
+    pub alloccfg: u8,
+    /// Shareability configuration, bits `[13:12]`.
+    #[bits(2)]
+    pub shcfg: u8,
+    #[bits(2)]
+    _reserved1: u32,
+    /// Privilege override, bits `[17:16]`.
+    #[bits(2)]
+    pub privcfg: u8,
+    /// Instruction/data type override, bits `[19:18]`.
+    #[bits(2)]
+    pub instcfg: u8,
+    /// Abort all incoming transactions, bit `[20]`.
+    pub abort: bool,
+    #[bits(10)]
+    _reserved2: u32,
+    /// Update in progress (cleared by SMMU on completion), bit `[31]`.
+    pub update: bool,
+}
+
+/// SMMU_IRQ_CTRL: Interrupt enable control.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct IrqCtrl {
+    /// Global error IRQ enable, bit `[0]`.
+    pub gerror_irqen: bool,
+    /// PRI queue IRQ enable, bit `[1]`.
+    pub priq_irqen: bool,
+    /// Event queue IRQ enable, bit `[2]`.
+    pub eventq_irqen: bool,
+    #[bits(29)]
+    _reserved: u32,
+}
+
+/// SMMU_GERROR / SMMU_GERRORN: Global error status bits.
+///
+/// An error is active when `GERROR[bit] != GERRORN[bit]`. The SMMU toggles
+/// GERROR to signal; software toggles GERRORN to acknowledge.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct Gerror {
+    /// Command queue error, bit `[0]`.
+    pub cmdq_err: bool,
+    #[bits(1)]
+    _reserved0: u32,
+    /// Event queue access aborted, bit `[2]`.
+    pub eventq_abt_err: bool,
+    /// PRI queue access aborted, bit `[3]`.
+    pub priq_abt_err: bool,
+    /// CMD_SYNC MSI aborted, bit `[4]`.
+    pub msi_cmdq_abt_err: bool,
+    /// EVTQ MSI aborted, bit `[5]`.
+    pub msi_eventq_abt_err: bool,
+    /// PRIQ MSI aborted, bit `[6]`.
+    pub msi_priq_abt_err: bool,
+    /// GERROR MSI aborted, bit `[7]`.
+    pub msi_gerror_abt_err: bool,
+    /// Service failure mode, bit `[8]`.
+    pub sfm_err: bool,
+    #[bits(23)]
+    _reserved1: u32,
+}
+
+// =============================================================================
+// Bitfield Types — Queue Base Registers
+// =============================================================================
+
+/// SMMU_STRTAB_BASE: Stream table base address.
+#[bitfield(u64)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct StrtabBase {
+    #[bits(6)]
+    _reserved0: u64,
+    /// Physical address of the stream table, bits `[55:6]`.
+    #[bits(50)]
+    pub addr_bits: u64,
+    #[bits(6)]
+    _reserved1: u64,
+    /// Read-allocate hint, bit `[62]`.
+    pub ra: bool,
+    #[bits(1)]
+    _reserved2: u64,
+}
+
+impl StrtabBase {
+    /// Returns the physical address of the stream table (low 6 bits are always zero).
+    pub fn addr(&self) -> u64 {
+        self.addr_bits() << 6
+    }
+}
+
+/// SMMU_STRTAB_BASE_CFG: Stream table configuration.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct StrtabBaseCfg {
+    /// Table size as log2(entries), bits `[5:0]`.
+    #[bits(6)]
+    pub log2size: u8,
+    /// Split point for 2-level tables (ignored for linear), bits `[10:6]`.
+    #[bits(5)]
+    pub split: u8,
+    #[bits(5)]
+    _reserved: u32,
+    /// Stream table format, bits `[17:16]`: 0=linear, 1=2-level (see `StrtabFmt`).
+    #[bits(2)]
+    pub fmt: u8,
+    #[bits(14)]
+    _reserved2: u32,
+}
+
+open_enum! {
+    /// Stream table format values for `StrtabBaseCfg.fmt` (2-bit field; 2 and 3 are unnamed).
+    pub enum StrtabFmt: u8 {
+        /// Linear stream table.
+        LINEAR = 0,
+        /// 2-level stream table.
+        TWO_LEVEL = 1,
+    }
+}
+
+/// SMMU_CMDQ_BASE / SMMU_EVENTQ_BASE: Queue base address.
+#[bitfield(u64)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct QueueBase {
+    /// Queue size as log2(entries), bits `[4:0]`.
+    #[bits(5)]
+    pub log2size: u8,
+    /// Physical address of queue memory, bits `[55:5]`.
+    #[bits(51)]
+    pub addr_bits: u64,
+    #[bits(6)]
+    _reserved: u64,
+    /// Read/write allocate hint, bit `[62]`.
+    pub ra_wa: bool,
+    #[bits(1)]
+    _reserved2: u64,
+}
+
+impl QueueBase {
+    /// Returns the physical address of the queue (low 5 bits are always zero).
+    pub fn addr(&self) -> u64 {
+        self.addr_bits() << 5
+    }
+}
+
+/// SMMU_CMDQ_CONS: Command queue consumer index.
+///
+/// Has an error field in the upper bits that indicates the reason for a
+/// command queue error.
+#[bitfield(u32)]
+#[derive(PartialEq, Eq, Inspect)]
+pub struct CmdqCons {
+    /// Read index with wrap bit (bits `[19:0]`).
+    #[bits(20)]
+    pub rd: u32,
+    #[bits(4)]
+    _reserved: u32,
+    /// Error code, bits `[30:24]` (valid when GERROR.CMDQ_ERR is active).
+    #[bits(7)]
+    pub err: u8,
+    #[bits(1)]
+    _reserved2: u32,
+}
+
+open_enum! {
+    /// Command queue error codes for `CmdqCons.err`.
+    pub enum CmdqError: u8 {
+        /// No error.
+        CERROR_NONE = 0,
+        /// Illegal command.
+        CERROR_ILL = 1,
+        /// Command queue abort.
+        CERROR_ABT = 2,
+        /// ATS error. NOTE(review): the spec appears to name code 3 CERROR_ATC_INV_SYNC; confirm.
+        CERROR_ATS_ERR = 3,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_idr0_bitfield_roundtrip() {
+        let idr0 = Idr0::new()
+            .with_s1p(true)
+            .with_ttf(0b10)
+            .with_cohacc(true)
+            .with_asid16(true)
+            .with_msi(true)
+            .with_ttendian(0b10)
+            .with_stall_model(0b01)
+            .with_term_model(true);
+        assert!(idr0.s1p());
+        assert_eq!(idr0.ttf(), 0b10);
+        assert!(idr0.cohacc());
+        assert!(idr0.asid16());
+        assert!(idr0.msi());
+        assert_eq!(idr0.ttendian(), 0b10);
+        assert!(!idr0.s2p());
+        assert!(!idr0.ats());
+        assert!(!idr0.pri());
+    }
+
+    #[test]
+    fn test_idr0_recommended_value() {
+        // The recommended IDR0 value from the spec reference:
+        // S1P=1, TTF=0b10, COHACC=1, ASID16=1, MSI=1, TTENDIAN=0b10 (LE),
+        // STALL_MODEL=0b01, TERM_MODEL=1 (bits 1, 3, 4, 12, 13, 22, 24, 26) = 0x0540_301A
+        let idr0 = Idr0::new()
+            .with_s1p(true)
+            .with_ttf(0b10)
+            .with_cohacc(true)
+            .with_asid16(true)
+            .with_msi(true)
+            .with_ttendian(0b10)
+            .with_stall_model(0b01)
+            .with_term_model(true);
+
+        // Pin the raw encoding so a field-layout regression is caught, then spot-check bits.
+        assert_eq!(u32::from(idr0), 0x0540_301A);
+        assert!(idr0.s1p());
+        assert!(idr0.msi());
+        assert!(idr0.cohacc());
+        assert!(idr0.term_model());
+    }
+
+    #[test]
+    fn test_idr1_bitfield_roundtrip() {
+        let idr1 = Idr1::new().with_sidsize(8).with_cmdqs(8).with_eventqs(8);
+        assert_eq!(idr1.sidsize(), 8);
+        assert_eq!(idr1.cmdqs(), 8);
+        assert_eq!(idr1.eventqs(), 8);
+        assert_eq!(idr1.ssidsize(), 0);
+        assert!(!idr1.tables_preset());
+        assert!(!idr1.queues_preset());
+    }
+
+    #[test]
+    fn test_idr5_bitfield_roundtrip() {
+        let idr5 = Idr5::new()
+            .with_oas(0b010)
+            .with_gran4k(true)
+            .with_gran64k(true);
+        assert_eq!(idr5.oas(), 0b010);
+        assert!(idr5.gran4k());
+        assert!(idr5.gran64k());
+        assert!(!idr5.gran16k());
+    }
+
+    #[test]
+    fn test_cr0_bitfield_roundtrip() {
+        let cr0 = Cr0::new()
+            .with_smmuen(true)
+            .with_cmdqen(true)
+            .with_eventqen(true);
+        assert!(cr0.smmuen());
+        assert!(cr0.cmdqen());
+        assert!(cr0.eventqen());
+        assert!(!cr0.priqen());
+    }
+
+    #[test]
+    fn test_cr0_enable_sequence() {
+        // Linux enables features one at a time:
+        // 1. CMDQEN
+        let cr0 = Cr0::new().with_cmdqen(true);
+        assert!(cr0.cmdqen());
+        assert!(!cr0.eventqen());
+        assert!(!cr0.smmuen());
+
+        // 2. CMDQEN + EVENTQEN
+        let cr0 = cr0.with_eventqen(true);
+        assert!(cr0.cmdqen());
+        assert!(cr0.eventqen());
+        assert!(!cr0.smmuen());
+
+        // 3. CMDQEN + EVENTQEN + SMMUEN
+        let cr0 = cr0.with_smmuen(true);
+        assert!(cr0.cmdqen());
+        assert!(cr0.eventqen());
+        assert!(cr0.smmuen());
+    }
+
+    #[test]
+    fn test_gbpa_update_bit() {
+        let gbpa = Gbpa::new().with_update(true).with_abort(true);
+        assert!(gbpa.update());
+        assert!(gbpa.abort());
+
+        // Simulate SMMU clearing the update bit
+        let gbpa = gbpa.with_update(false);
+        assert!(!gbpa.update());
+        assert!(gbpa.abort());
+    }
+
+    #[test]
+    fn test_irq_ctrl_roundtrip() {
+        let irq_ctrl = IrqCtrl::new()
+            .with_gerror_irqen(true)
+            .with_eventq_irqen(true);
+        assert!(irq_ctrl.gerror_irqen());
+        assert!(irq_ctrl.eventq_irqen());
+        assert!(!irq_ctrl.priq_irqen());
+    }
+
+    #[test]
+    fn test_gerror_toggle_protocol() {
+        let gerror = Gerror::new().with_cmdq_err(true);
+        let gerrorn = Gerror::new();
+
+        // Error is active when bits differ
+        assert_ne!(gerror.cmdq_err(), gerrorn.cmdq_err(),);
+
+        // Software acknowledges by matching
+        let gerrorn = gerrorn.with_cmdq_err(true);
+        assert_eq!(gerror.cmdq_err(), gerrorn.cmdq_err(),);
+    }
+
+    #[test]
+    fn test_strtab_base_address() {
+        // Address must be 64-byte aligned (bottom 6 bits zero)
+        let base = StrtabBase::new().with_addr_bits(0x1000_0000_u64 >> 6);
+        assert_eq!(base.addr(), 0x1000_0000);
+
+        let base = StrtabBase::new().with_addr_bits(0x0080_0000_0000_u64 >> 6);
+        assert_eq!(base.addr(), 0x0080_0000_0000);
+    }
+
+    #[test]
+    fn test_strtab_base_cfg_roundtrip() {
+        let cfg = StrtabBaseCfg::new()
+            .with_fmt(StrtabFmt::LINEAR.0)
+            .with_log2size(8);
+        assert_eq!(cfg.fmt(), StrtabFmt::LINEAR.0);
+        assert_eq!(cfg.log2size(), 8);
+    }
+
+    #[test]
+    fn test_queue_base_address() {
+        let base = QueueBase::new()
+            .with_addr_bits(0x2000_0000_u64 >> 5)
+            .with_log2size(8);
+        assert_eq!(base.addr(), 0x2000_0000);
+        assert_eq!(base.log2size(), 8);
+    }
+
+    #[test]
+    fn test_cmdq_cons_error() {
+        let cons = CmdqCons::new()
+            .with_rd(42)
+            .with_err(CmdqError::CERROR_ILL.0);
+        assert_eq!(cons.rd(), 42);
+        assert_eq!(cons.err(), CmdqError::CERROR_ILL.0);
+    }
+
+    #[test]
+    fn test_register_offsets() {
+        // Verify offsets match the spec
+        assert_eq!(IDR0, 0x0000);
+        assert_eq!(IDR1, 0x0004);
+        assert_eq!(IDR5, 0x0014);
+        assert_eq!(IIDR, 0x0018);
+        assert_eq!(AIDR, 0x001C);
+        assert_eq!(CR0, 0x0020);
+        assert_eq!(CR0ACK, 0x0024);
+        assert_eq!(CR1, 0x0028);
+        assert_eq!(CR2, 0x002C);
+        assert_eq!(GBPA, 0x0044);
+        assert_eq!(IRQ_CTRL, 0x0050);
+        assert_eq!(IRQ_CTRLACK, 0x0054);
+        assert_eq!(GERROR, 0x0060);
+        assert_eq!(GERRORN, 0x0064);
+        assert_eq!(GERROR_IRQ_CFG0, 0x0068);
+        assert_eq!(STRTAB_BASE, 0x0080);
+        assert_eq!(STRTAB_BASE_CFG, 0x0088);
+        assert_eq!(CMDQ_BASE, 0x0090);
+        assert_eq!(CMDQ_PROD, 0x0098);
+        assert_eq!(CMDQ_CONS, 0x009C);
+        assert_eq!(EVENTQ_BASE, 0x00A0);
+        assert_eq!(EVENTQ_IRQ_CFG0, 0x00B0);
+        assert_eq!(EVENTQ_PROD_PAGE1, 0x100A8);
+        assert_eq!(EVENTQ_CONS_PAGE1, 0x100AC);
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/spec/ste.rs b/vm/devices/iommu/smmu/src/spec/ste.rs
new file mode 100644
index 0000000000..0c40340bf8
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/spec/ste.rs
@@ -0,0 +1,309 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMUv3 Stream Table Entry (STE) definitions.
+//!
+//! Each STE is 64 bytes (512 bits). The STE describes how the SMMU processes
+//! transactions for a given stream (device).
+
+use bitfield_struct::bitfield;
+use open_enum::open_enum;
+use zerocopy::FromBytes;
+use zerocopy::Immutable;
+use zerocopy::IntoBytes;
+use zerocopy::KnownLayout;
+
+/// Size of a single stream table entry in bytes (64 bytes = 512 bits).
+pub const STE_SIZE: usize = 64;
+
+/// Stream table entry (64 bytes).
+///
+/// Only the first two quadwords have defined fields for stage 1 translation.
+/// The remaining quadwords are used for stage 2 and other optional features
+/// and are carried here as uninterpreted `u64` values.
+///
+/// The struct is `#[repr(C)]` and consists of eight 64-bit words, so it can
+/// be converted to and from raw bytes through the `zerocopy` derives.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct Ste {
+    /// Quadword 0: Valid, Config, S1 context pointer.
+    pub qw0: SteDw0,
+    /// Quadword 1: Stage 1 attributes, stream world.
+    pub qw1: SteDw1,
+    /// Quadwords 2-7: Stage 2 fields (unused for S1-only).
+    pub _qw2_7: [u64; 6],
+}
+
+impl Ste {
+    /// Whether the entry's V (valid) bit is set.
+    pub fn valid(&self) -> bool {
+        let word0 = &self.qw0;
+        word0.v()
+    }
+
+    /// The stream configuration, wrapped in [`SteConfig`].
+    pub fn config(&self) -> SteConfig {
+        let raw = self.qw0.config();
+        SteConfig(raw)
+    }
+
+    /// The stage 1 context descriptor pointer as a physical address.
+    ///
+    /// QW0 stores address bits `[55:6]` only, so the stored value is shifted
+    /// left by 6 to rebuild the address.
+    pub fn s1_context_ptr(&self) -> u64 {
+        const DROPPED_LOW_BITS: u32 = 6;
+        let stored = self.qw0.s1_context_ptr();
+        stored << DROPPED_LOW_BITS
+    }
+
+    /// The S1CDMax field (log2 of the number of context descriptors).
+    pub fn s1_cd_max(&self) -> u8 {
+        let word0 = &self.qw0;
+        word0.s1_cd_max()
+    }
+
+    /// The raw S1Fmt field (CD table format).
+    pub fn s1_fmt(&self) -> u8 {
+        let word0 = &self.qw0;
+        word0.s1_fmt()
+    }
+}
+
+/// STE QW0 (bits `[63:0]`): Valid, Config, S1 pointers.
+///
+/// Fields are listed from bit 0 upward. Despite the `Dw` in the type name,
+/// this is a full 64-bit quadword (`#[bitfield(u64)]`).
+#[bitfield(u64)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct SteDw0 {
+    /// Valid bit (bit 0).
+    pub v: bool,
+    /// Stream configuration, bits `[3:1]`; see [`SteConfig`].
+    #[bits(3)]
+    pub config: u8,
+    /// Stage 1 CD table format (0=linear, 1=2-level 4KB, 2=2-level 64KB),
+    /// bits `[5:4]`; see [`S1Fmt`].
+    #[bits(2)]
+    pub s1_fmt: u8,
+    /// Stage 1 context descriptor pointer, bits `[55:6]` (address >> 6).
+    #[bits(50)]
+    pub s1_context_ptr: u64,
+    // Bits `[58:56]` are not interpreted here.
+    #[bits(3)]
+    _reserved: u64,
+    /// Log2(number of CDs), bits `[63:59]`. 0 = single CD.
+    #[bits(5)]
+    pub s1_cd_max: u8,
+}
+
+/// STE QW1 (bits `[127:64]`): Stage 1 attributes, stream world, etc.
+///
+/// Fields are packed from bit 0 of this quadword upward in declaration
+/// order (the widths below sum to 64).
+///
+/// NOTE(review): past `s1_csh` and `dre`, this layout does not look like the
+/// architected STE. For example, the Linux SMMUv3 driver reads EATS at QW1
+/// bits `[29:28]`, STRW at `[31:30]`, and SHCFG at `[45:44]`, whereas the
+/// declaration order here puts `shcfg` at `[29:28]`, `nscfg` at `[31:30]`,
+/// `strw` at `[36:35]`, and `eats` at `[52:50]`; S2VMID is, as far as I can
+/// tell, part of QW2 rather than QW1. Confirm against the SMMUv3
+/// specification before relying on any accessor beyond the low byte to
+/// interpret guest-written STEs.
+#[bitfield(u64)]
+#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
+pub struct SteDw1 {
+    /// S1 default substream behavior.
+    #[bits(2)]
+    pub s1_dss: u8,
+    /// CD pointer inner cacheability.
+    #[bits(2)]
+    pub s1_cir: u8,
+    /// CD pointer outer cacheability.
+    #[bits(2)]
+    pub s1_cor: u8,
+    /// CD pointer shareability.
+    #[bits(2)]
+    pub s1_csh: u8,
+    #[bits(4)]
+    _reserved0: u64,
+    /// DRE (DPCM/stall related).
+    ///
+    /// NOTE(review): the SMMUv3 specification appears to name this field
+    /// "Destructive Read Enable" — confirm the description above.
+    pub dre: bool,
+    /// Contiguous hint.
+    ///
+    /// NOTE(review): modelled as one bit here; the architected CONT field
+    /// is presumably wider — confirm.
+    pub cont: bool,
+    #[bits(2)]
+    _reserved1: u64,
+    /// Memory type config / MemAttr / MEV.
+    #[bits(5)]
+    pub mem_attr_and_mev: u8,
+    #[bits(3)]
+    _reserved2: u64,
+    /// Allocation configuration.
+    #[bits(4)]
+    pub alloccfg: u8,
+    /// Shareability override.
+    #[bits(2)]
+    pub shcfg: u8,
+    /// NS configuration.
+    #[bits(2)]
+    pub nscfg: u8,
+    #[bits(3)]
+    _reserved3: u64,
+    /// Stream world; see [`Strw`].
+    #[bits(2)]
+    pub strw: u8,
+    /// Memory type config override.
+    pub mtcfg: bool,
+    /// Memory attribute (for bypass).
+    #[bits(4)]
+    pub mem_attr: u8,
+    /// Instruction/data override.
+    #[bits(2)]
+    pub instcfg: u8,
+    /// Privilege override.
+    #[bits(2)]
+    pub privcfg: u8,
+    /// Software reserved fields.
+    #[bits(4)]
+    pub sw_reserved: u8,
+    /// EATS (ATS behavior).
+    #[bits(3)]
+    pub eats: u8,
+    /// S2 VMID (ignored for S2 bypass).
+    #[bits(11)]
+    pub s2_vmid: u16,
+}
+
+open_enum! {
+    /// STE Config field values (bits `[3:1]` of QW0, see `SteDw0::config`).
+    ///
+    /// This is an open enum: encodings not named below can still be
+    /// represented and must be handled by callers.
+    pub enum SteConfig: u8 {
+        /// Abort: all transactions are aborted.
+        ABORT = 0b000,
+        /// Bypass: S1 bypass, S2 bypass (identity mapping).
+        BYPASS = 0b100,
+        /// S1 Translate, S2 Bypass.
+        S1_TRANS = 0b101,
+        /// S1 Bypass, S2 Translate.
+        S2_TRANS = 0b110,
+        /// S1 Translate, S2 Translate.
+        S1S2_TRANS = 0b111,
+    }
+}
+
+open_enum! {
+    /// STE S1Fmt (CD table format) values, as stored in `SteDw0::s1_fmt`.
+    pub enum S1Fmt: u8 {
+        /// Linear CD table.
+        LINEAR = 0b00,
+        /// 2-level CD table, 4KB L2.
+        TWO_LEVEL_4K = 0b01,
+        /// 2-level CD table, 64KB L2.
+        TWO_LEVEL_64K = 0b10,
+    }
+}
+
+open_enum! {
+    /// STE stream world values, as stored in `SteDw1::strw`.
+    ///
+    /// The encoding `0b01` is not named here.
+    pub enum Strw: u8 {
+        /// Non-secure EL1.
+        NS_EL1 = 0b00,
+        /// Non-secure EL2.
+        NS_EL2 = 0b10,
+        /// EL2 with E2H.
+        EL2_E2H = 0b11,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Assemble an STE from a QW0 value, leaving every other word zeroed.
+    fn ste_from_qw0(qw0: SteDw0) -> Ste {
+        Ste {
+            qw0,
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        }
+    }
+
+    #[test]
+    fn test_ste_size() {
+        // The in-memory struct must be exactly one table entry wide.
+        let actual = size_of::<Ste>();
+        assert_eq!(actual, STE_SIZE);
+    }
+
+    #[test]
+    fn test_ste_valid() {
+        // A freshly built entry has V=0.
+        let cleared = ste_from_qw0(SteDw0::new());
+        assert!(!cleared.valid());
+
+        // Setting V in QW0 is all it takes to make the entry valid.
+        let set = Ste {
+            qw0: SteDw0::new().with_v(true),
+            ..cleared
+        };
+        assert!(set.valid());
+    }
+
+    #[test]
+    fn test_ste_config_values() {
+        let expected = [
+            (SteConfig::ABORT, 0b000),
+            (SteConfig::BYPASS, 0b100),
+            (SteConfig::S1_TRANS, 0b101),
+            (SteConfig::S2_TRANS, 0b110),
+            (SteConfig::S1S2_TRANS, 0b111),
+        ];
+        for (config, raw) in expected {
+            assert_eq!(config.0, raw);
+        }
+    }
+
+    #[test]
+    fn test_ste_dw0_fields() {
+        let config = SteConfig::S1_TRANS.0;
+        let fmt = S1Fmt::LINEAR.0;
+        let word = SteDw0::new()
+            .with_s1_cd_max(0)
+            .with_s1_context_ptr(0x1000_0000_u64 >> 6)
+            .with_s1_fmt(fmt)
+            .with_config(config)
+            .with_v(true);
+
+        assert!(word.v());
+        assert_eq!(word.config(), config);
+        assert_eq!(word.s1_fmt(), fmt);
+        assert_eq!(word.s1_context_ptr() << 6, 0x1000_0000);
+        assert_eq!(word.s1_cd_max(), 0);
+    }
+
+    #[test]
+    fn test_ste_dw1_fields() {
+        let world = Strw::NS_EL1.0;
+        let word = SteDw1::new()
+            .with_strw(world)
+            .with_s1_csh(0b11) // ISH
+            .with_s1_cor(0b01) // WB
+            .with_s1_cir(0b01); // WB
+
+        assert_eq!(word.s1_cir(), 0b01);
+        assert_eq!(word.s1_cor(), 0b01);
+        assert_eq!(word.s1_csh(), 0b11);
+        assert_eq!(word.strw(), world);
+    }
+
+    #[test]
+    fn test_ste_bypass() {
+        let qw0 = SteDw0::new().with_config(SteConfig::BYPASS.0).with_v(true);
+        let entry = ste_from_qw0(qw0);
+
+        assert!(entry.valid());
+        assert_eq!(entry.config(), SteConfig::BYPASS);
+    }
+
+    #[test]
+    fn test_ste_s1_trans() {
+        let cd_addr: u64 = 0x8000_0000;
+        let qw0 = SteDw0::new()
+            .with_v(true)
+            .with_config(SteConfig::S1_TRANS.0)
+            .with_s1_fmt(S1Fmt::LINEAR.0)
+            .with_s1_context_ptr(cd_addr >> 6)
+            .with_s1_cd_max(0);
+        let qw1 = SteDw1::new()
+            .with_s1_cir(0b01)
+            .with_s1_cor(0b01)
+            .with_s1_csh(0b11)
+            .with_strw(Strw::NS_EL1.0);
+        let entry = Ste {
+            qw1,
+            ..ste_from_qw0(qw0)
+        };
+
+        assert!(entry.valid());
+        assert_eq!(entry.config(), SteConfig::S1_TRANS);
+        assert_eq!(entry.s1_context_ptr(), cd_addr);
+        assert_eq!(entry.s1_cd_max(), 0);
+        assert_eq!(entry.s1_fmt(), S1Fmt::LINEAR.0);
+    }
+
+    #[test]
+    fn test_ste_invalid_returns_fault() {
+        // Only the V bit is checked here: an all-zero entry reports invalid.
+        let entry = ste_from_qw0(SteDw0::new());
+        assert!(!entry.valid());
+    }
+
+    #[test]
+    fn test_ste_context_ptr_alignment() {
+        // Context pointer is 64-byte aligned (bits [55:6]), so the rebuilt
+        // address must have its low six bits clear.
+        let word = SteDw0::new().with_s1_context_ptr(0xABCD_EF00_u64 >> 6);
+        let rebuilt = word.s1_context_ptr() << 6;
+        assert_eq!(rebuilt & 0x3F, 0);
+    }
+}
diff --git a/vm/devices/iommu/smmu/src/translate.rs b/vm/devices/iommu/smmu/src/translate.rs
new file mode 100644
index 0000000000..dcbfcbc701
--- /dev/null
+++ b/vm/devices/iommu/smmu/src/translate.rs
@@ -0,0 +1,1046 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMU translation logic: STE lookup, CD lookup, and translation context.
+//!
+//! This module handles the IOVA→GPA translation chain:
+//! 1. Look up the Stream Table Entry (STE) by stream ID.
+//! 2. Determine the translation action from STE.Config.
+//! 3. For S1 translation, look up the Context Descriptor (CD).
+//! 4. Extract the translation context (page table base, granule, etc.).
+
+use crate::spec::cd::Cd;
+use crate::spec::cd::Tg0;
+use crate::spec::events::EventId;
+use crate::spec::events::EvtEntry;
+use crate::spec::pt::ApBits;
+use crate::spec::pt::PtDesc;
+use crate::spec::ste::STE_SIZE;
+use crate::spec::ste::Ste;
+use crate::spec::ste::SteConfig;
+use guestmem::GuestMemory;
+
+/// Result of an STE config dispatch.
+///
+/// Produced by `ste_config_action` for the three STE configurations this
+/// module handles; any other configuration is returned to the caller as an
+/// error instead.
+#[derive(Debug, PartialEq, Eq)]
+pub enum SteAction {
+    /// Abort all transactions for this stream.
+    Abort,
+    /// Bypass translation (identity IOVA=GPA).
+    Bypass,
+    /// Stage 1 translation — proceed to CD lookup.
+    S1Translate,
+}
+
+/// Parameters for walking an AArch64 stage 1 page table, extracted from
+/// STE + CD.
+///
+/// Built by `translation_context` and consumed by `walk_s1`. The fields
+/// with a leading underscore are recorded but not read by the walker.
+#[derive(Debug, Clone)]
+pub struct TranslationContext {
+    /// Page table base address (physical/GPA) from CD.TTB0.
+    pub ttb0: u64,
+    /// Input address size: VA range = 2^(64 - t0sz).
+    pub t0sz: u8,
+    /// Granule size (4K, 16K, or 64K).
+    pub tg0: Tg0,
+    /// Output address size in bits. Leaf output addresses that need more
+    /// bits than this are reported as address size faults.
+    pub oas_bits: u8,
+    /// MAIR0 value (for attribute interpretation — not needed for address
+    /// translation yet, but will be used for TLB and memory attribute
+    /// emulation).
+    pub _mair0: u64,
+    /// ASID (for TLB tagging — will be used when a software TLB is added).
+    pub _asid: u16,
+}
+
+/// Error from STE/CD lookup or from the stage 1 page table walk.
+///
+/// Wraps the event record describing the failure.
+#[derive(Debug)]
+pub struct SmmuFault {
+    /// The event to write to the EVTQ.
+    pub event: EvtEntry,
+}
+
+impl SmmuFault {
+    /// Fault carrying the `EvtEntry::bad_ste` event for stream `sid`.
+    fn bad_ste(sid: u32) -> Self {
+        let event = EvtEntry::bad_ste(sid);
+        Self { event }
+    }
+
+    /// Fault carrying a `C_BAD_STREAMID` event for stream `sid`.
+    ///
+    /// Built by hand: start from an empty event record and fill in the
+    /// event ID and the stream ID.
+    fn bad_streamid(sid: u32) -> Self {
+        let header =
+            crate::spec::events::EvtHeader::new().with_event_id(EventId::C_BAD_STREAMID.0);
+        let event = EvtEntry {
+            header,
+            sid,
+            ..EvtEntry::new()
+        };
+        Self { event }
+    }
+
+    /// Fault carrying the `EvtEntry::bad_cd` event for stream `sid`.
+    fn bad_cd(sid: u32) -> Self {
+        let event = EvtEntry::bad_cd(sid);
+        Self { event }
+    }
+}
+
+/// Look up the STE for a given stream ID.
+///
+/// `gm` is the guest memory the stream table is read from.
+/// `strtab_base` is the physical base address of the linear stream table.
+/// `strtab_log2size` is the log2 of the number of entries.
+/// `sid` is the stream ID used as the table index.
+///
+/// Returns the parsed STE or a fault event.
+///
+/// # Errors
+///
+/// * `C_BAD_STREAMID` event if `sid` is at or beyond `2^strtab_log2size`.
+/// * Bad-STE event if the entry address overflows, the entry cannot be
+///   read from guest memory, or its V bit is clear.
+pub fn lookup_ste(
+    gm: &GuestMemory,
+    strtab_base: u64,
+    strtab_log2size: u8,
+    sid: u32,
+) -> Result<Ste, SmmuFault> {
+    // The table holds 2^strtab_log2size entries, so `sid` is in range exactly
+    // when it has no bits set at or above that position. The checked shift
+    // keeps an oversized log2size (64 or more) from overflowing; a table
+    // that large covers every 32-bit stream ID.
+    let bits_past_table = u64::from(sid)
+        .checked_shr(u32::from(strtab_log2size))
+        .unwrap_or(0);
+    if bits_past_table != 0 {
+        return Err(SmmuFault::bad_streamid(sid));
+    }
+
+    // `sid * STE_SIZE` cannot overflow (at most 2^32 * 64), but adding it to
+    // the base can if the base is close to `u64::MAX`. Treat a wrapped
+    // address like an unreadable entry instead of panicking or reading from
+    // an unrelated location.
+    let ste_addr = strtab_base
+        .checked_add(u64::from(sid) * (STE_SIZE as u64))
+        .ok_or_else(|| SmmuFault::bad_ste(sid))?;
+    let ste: Ste = gm
+        .read_plain(ste_addr)
+        .map_err(|_| SmmuFault::bad_ste(sid))?;
+
+    if !ste.valid() {
+        return Err(SmmuFault::bad_ste(sid));
+    }
+
+    Ok(ste)
+}
+
+/// Map an STE's Config field to the action the translation path takes.
+///
+/// Abort, bypass, and stage 1 translate are recognized. Every other
+/// configuration value is handed back unchanged as the error.
+pub fn ste_config_action(ste: &Ste) -> Result<SteAction, SteConfig> {
+    let config = ste.config();
+    if config == SteConfig::ABORT {
+        Ok(SteAction::Abort)
+    } else if config == SteConfig::BYPASS {
+        Ok(SteAction::Bypass)
+    } else if config == SteConfig::S1_TRANS {
+        Ok(SteAction::S1Translate)
+    } else {
+        Err(config)
+    }
+}
+
+/// Fetch and validate the context descriptor an STE points at.
+///
+/// `gm` is the guest memory the CD is read from, `ste` supplies the CD
+/// table pointer and S1CDMax, `sid` is only used to label fault events,
+/// and `ssid` is the sub-stream ID (0 for single-CD setups).
+///
+/// Returns the parsed CD, or a bad-CD fault when the SSID is out of range,
+/// the CD cannot be read, its V bit is clear, or it is not AArch64.
+///
+/// NOTE(review): the table is always indexed linearly; the STE's S1Fmt
+/// field is not consulted here — confirm 2-level CD tables cannot reach
+/// this path.
+pub fn lookup_cd(gm: &GuestMemory, ste: &Ste, sid: u32, ssid: u32) -> Result<Cd, SmmuFault> {
+    let bad_cd = || SmmuFault::bad_cd(sid);
+
+    // S1CDMax is log2 of the CD count, so valid SSIDs are 0..2^S1CDMax.
+    // With S1CDMax == 0 that leaves SSID 0 as the only valid value.
+    let table_base = ste.s1_context_ptr();
+    let cd_count = 1u32 << ste.s1_cd_max();
+    if ssid >= cd_count {
+        return Err(bad_cd());
+    }
+
+    let cd_addr = table_base + (ssid as u64) * (crate::spec::cd::CD_SIZE as u64);
+    let cd: Cd = gm.read_plain(cd_addr).map_err(|_| bad_cd())?;
+
+    // The CD must be valid, and only AArch64 page tables are supported.
+    if cd.valid() && cd.aa64() {
+        Ok(cd)
+    } else {
+        Err(bad_cd())
+    }
+}
+
+/// Build the page table walk parameters from a parsed CD.
+///
+/// `sid` is only used to label the fault event.
+///
+/// Returns a bad-CD fault when the CD cannot be used: unrecognized granule,
+/// unrecognized IPS encoding, T0SZ above 48, or EPD0 set.
+pub fn translation_context(cd: &Cd, sid: u32) -> Result<TranslationContext, SmmuFault> {
+    let reject = || SmmuFault::bad_cd(sid);
+
+    // The granule must be one with a known size.
+    let tg0 = cd.tg0();
+    let Some(_) = tg0.granule_size() else {
+        return Err(reject());
+    };
+
+    // The IPS encoding must map to a known output address width.
+    let Some(oas_bits) = cd.ips().bits() else {
+        return Err(reject());
+    };
+
+    // T0SZ above 48 is rejected, and EPD0=1 disables TTB0 walks so every
+    // access would fault anyway.
+    let t0sz = cd.t0sz();
+    if t0sz > 48 || cd.epd0() {
+        return Err(reject());
+    }
+
+    let context = TranslationContext {
+        ttb0: cd.ttb0(),
+        t0sz,
+        tg0,
+        oas_bits,
+        _mair0: cd.mair0,
+        _asid: cd.asid(),
+    };
+    Ok(context)
+}
+
+/// Result of a successful page table walk.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Translation {
+    /// Translated guest physical address (with page offset applied).
+    pub gpa: u64,
+    /// Size in bytes of the mapping (granule for pages, block size for
+    /// blocks).
+    pub page_size: u64,
+}
+
+/// Work out where a walk starts for a given granule and T0SZ.
+///
+/// Returns `(start_level, va_bits)` where `va_bits = 64 - t0sz`, or `None`
+/// when the granule is not recognized, `t0sz` exceeds 64, the VA range is
+/// smaller than one page, or the range would need zero or more than four
+/// levels.
+fn compute_start_level(tg0: Tg0, t0sz: u8) -> Option<(u8, u8)> {
+    let va_bits = 64u8.checked_sub(t0sz)?;
+    let bits_per_level = tg0.bits_per_level()?;
+    let page_shift = tg0.page_shift()?;
+
+    // Bits the table walk has to resolve: everything above the page offset.
+    // For example, with a 4K granule that is `va_bits - 12`.
+    let walked_bits = va_bits.checked_sub(page_shift)?;
+
+    // Each level resolves `bits_per_level` bits, so the level count is the
+    // rounded-up quotient. Levels are numbered 0..=3, so a walk of N levels
+    // starts at level 4 - N. Zero levels means the VA space is exactly one
+    // page, which is not a valid configuration.
+    match walked_bits.div_ceil(bits_per_level) {
+        levels @ 1..=4 => Some((4 - levels, va_bits)),
+        _ => None,
+    }
+}
+
+/// Walk AArch64 stage 1 translation tables to translate an IOVA to a GPA.
+///
+/// `gm` is the guest memory (for reading page table entries from guest RAM).
+/// `ctx` holds the page table root and configuration (from STE+CD).
+/// `iova` is the input virtual address to translate.
+/// `write` is true for write accesses (for permission checking).
+/// `sid` is the stream ID (for fault event construction).
+///
+/// Returns the translated GPA and the size of the mapping that produced it
+/// (the granule for a level 3 page, the block size for a block).
+///
+/// # Errors
+///
+/// * Translation fault: unsupported granule or T0SZ, IOVA outside the
+///   `2^va_bits` input range, a descriptor that cannot be read or is not
+///   valid, a level 3 entry without the type bit, or a block descriptor at
+///   level 0.
+/// * Access or permission fault: as reported by `check_permissions` for
+///   the leaf descriptor.
+/// * Address size fault: the leaf output address does not fit in
+///   `ctx.oas_bits` bits.
+pub fn walk_s1(
+    gm: &GuestMemory,
+    ctx: &TranslationContext,
+    iova: u64,
+    write: bool,
+    sid: u32,
+) -> Result<Translation, SmmuFault> {
+    // Every structural problem found during the walk is reported as a
+    // translation fault for this access.
+    let translation_fault = || SmmuFault {
+        event: EvtEntry::translation_fault(sid, iova, write),
+    };
+    // Mask with the low `bits` bits set; 64 or more bits means "no limit".
+    let low_mask = |bits: u8| {
+        if bits >= 64 {
+            u64::MAX
+        } else {
+            (1u64 << bits) - 1
+        }
+    };
+
+    let tg0 = ctx.tg0;
+    let page_shift = u32::from(tg0.page_shift().ok_or_else(translation_fault)?);
+    let bits_per_level = u32::from(tg0.bits_per_level().ok_or_else(translation_fault)?);
+    let (start_level, va_bits) =
+        compute_start_level(tg0, ctx.t0sz).ok_or_else(translation_fault)?;
+
+    // Check IOVA is within the valid range (2^va_bits).
+    if iova > low_mask(va_bits) {
+        return Err(translation_fault());
+    }
+    let oas_mask = low_mask(ctx.oas_bits);
+
+    // Loop-invariant: every level indexes with `bits_per_level` bits. At the
+    // start level fewer bits may be meaningful, but the range check above
+    // guarantees the excess IOVA bits are zero.
+    let index_mask = (1u64 << bits_per_level) - 1;
+
+    let mut table_addr = ctx.ttb0;
+    let mut level = start_level;
+
+    loop {
+        // Index bits for level `l` start at
+        // `page_shift + (3 - l) * bits_per_level`; e.g. with a 4K granule
+        // level 0 uses bits [47:39] and level 3 uses bits [20:12].
+        let shift = page_shift + (3 - u32::from(level)) * bits_per_level;
+        let index = (iova >> shift) & index_mask;
+
+        // A table address near the top of the address space must not wrap.
+        let desc_addr = table_addr
+            .checked_add(index * 8)
+            .ok_or_else(translation_fault)?;
+        let desc: PtDesc = gm
+            .read_plain(desc_addr)
+            .map_err(|_| translation_fault())?;
+        if !desc.is_valid() {
+            return Err(translation_fault());
+        }
+
+        // At level 3, type=1 means page and type=0 is reserved (fault).
+        // Above level 3 an entry is either a block (leaf) or a table.
+        let is_leaf = if level == 3 {
+            if !desc.desc_type() {
+                return Err(translation_fault());
+            }
+            true
+        } else {
+            desc.is_block()
+        };
+
+        if !is_leaf {
+            // Table descriptor — descend to the next level. Level 3 always
+            // terminates above, so `level` never exceeds 3.
+            table_addr = desc.next_table_addr();
+            level += 1;
+            continue;
+        }
+
+        // Blocks are only meaningful at level 1 or 2; a block encoding at
+        // level 0 is treated as a malformed table rather than as a mapping.
+        if level == 0 {
+            return Err(translation_fault());
+        }
+
+        check_permissions(&desc, iova, write, sid)?;
+        let output_addr = output_address(&desc, tg0, level);
+        if output_addr > oas_mask {
+            return Err(SmmuFault {
+                event: EvtEntry::addr_size_fault(sid, iova, write),
+            });
+        }
+
+        // At level 3 `shift == page_shift`, so this is the granule size;
+        // at level 1 or 2 it is the block size.
+        let mapping_size = 1u64 << shift;
+        return Ok(Translation {
+            gpa: output_addr | (iova & (mapping_size - 1)),
+            page_size: mapping_size,
+        });
+    }
+}
+
+/// Validate the access flag and write permission of a leaf descriptor.
+///
+/// Returns an access fault when AF is clear; otherwise, for write accesses
+/// only, a permission fault when the AP bits do not allow writing.
+/// `iova` and `sid` are only used to fill in the fault event.
+fn check_permissions(desc: &PtDesc, iova: u64, write: bool, sid: u32) -> Result<(), SmmuFault> {
+    // The access flag is checked first, for reads and writes alike.
+    if !desc.af() {
+        let event = EvtEntry::access_fault(sid, iova, write);
+        return Err(SmmuFault { event });
+    }
+
+    // The AP bits are only consulted for writes.
+    if write && !ApBits(desc.ap()).allows_write() {
+        let event = EvtEntry::permission_fault(sid, iova, write);
+        return Err(SmmuFault { event });
+    }
+
+    Ok(())
+}
+
+/// Read the output address out of a leaf descriptor using the accessor that
+/// matches the granule, which masks it for the given level.
+///
+/// Any granule other than 16K or 64K uses the 4K accessor; that covers the
+/// 4K granule itself and serves as the fallback for unrecognized values,
+/// which should not reach this point.
+fn output_address(desc: &PtDesc, tg0: Tg0, level: u8) -> u64 {
+    match tg0 {
+        Tg0::GRAN_64K => desc.output_address_64k(level),
+        Tg0::GRAN_16K => desc.output_address_16k(level),
+        _ => desc.output_address_4k(level),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::spec::cd::CD_SIZE;
+    use crate::spec::cd::CdDw0;
+    use crate::spec::cd::CdDw1;
+    use crate::spec::cd::Ips;
+    use crate::spec::ste::SteDw0;
+    use crate::spec::ste::SteDw1;
+
+    // Guest address at which the tests place the stream table.
+    const STRTAB_BASE: u64 = 0x10_0000;
+    // Guest address at which the tests place the context descriptor table.
+    const CD_BASE: u64 = 0x20_0000;
+    // Log2 of the stream table entry count passed to `lookup_ste`.
+    const STRTAB_LOG2SIZE: u8 = 10; // 1024 entries
+
+    /// Returns a valid (V=1) STE configured for stage 1 translation whose
+    /// context pointer refers to `cd_base`, with a single CD (S1CDMax = 0).
+    fn make_s1_ste(cd_base: u64) -> Ste {
+        let qw0 = SteDw0::new()
+            .with_s1_cd_max(0) // single CD
+            .with_s1_context_ptr(cd_base >> 6)
+            .with_config(SteConfig::S1_TRANS.0)
+            .with_v(true);
+        Ste {
+            qw0,
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        }
+    }
+
+    /// Returns a valid (V=1) STE whose configuration is bypass.
+    fn make_bypass_ste() -> Ste {
+        let qw0 = SteDw0::new().with_config(SteConfig::BYPASS.0).with_v(true);
+        Ste {
+            qw0,
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        }
+    }
+
+    /// Returns a valid (V=1) STE whose configuration is abort.
+    fn make_abort_ste() -> Ste {
+        let qw0 = SteDw0::new().with_config(SteConfig::ABORT.0).with_v(true);
+        Ste {
+            qw0,
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        }
+    }
+
+    /// Returns a valid AArch64 CD with ASID 1 and the given table base,
+    /// T0SZ, granule, and IPS.
+    fn make_cd(ttb0: u64, t0sz: u8, tg0: Tg0, ips: Ips) -> Cd {
+        let qw0 = CdDw0::new()
+            .with_asid(1)
+            .with_aa64(true)
+            .with_ips(ips.0)
+            .with_tg0(tg0.0)
+            .with_t0sz(t0sz)
+            .with_v(true);
+        // TTB0 is stored shifted right by 4.
+        let qw1 = CdDw1::new().with_ttb0(ttb0 >> 4);
+        Cd {
+            qw0,
+            qw1,
+            _qw2: 0,
+            mair0: 0xFF440C0400,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        }
+    }
+
+    /// Store `ste` in the stream table slot for stream ID `sid`.
+    fn write_ste(gm: &GuestMemory, sid: u32, ste: &Ste) {
+        let slot_offset = (sid as u64) * (STE_SIZE as u64);
+        gm.write_plain(STRTAB_BASE + slot_offset, ste)
+            .expect("write STE");
+    }
+
+    /// Store `cd` in the slot for `ssid` of the CD table at `cd_base`.
+    fn write_cd(gm: &GuestMemory, cd_base: u64, ssid: u32, cd: &Cd) {
+        let slot_offset = (ssid as u64) * (CD_SIZE as u64);
+        gm.write_plain(cd_base + slot_offset, cd)
+            .expect("write CD");
+    }
+
+    // =========================================================================
+    // STE lookup tests
+    // =========================================================================
+
+    #[test]
+    fn test_ste_lookup_valid() {
+        const SID: u32 = 5;
+        let gm = GuestMemory::allocate(0x40_0000);
+        write_ste(&gm, SID, &make_s1_ste(CD_BASE));
+
+        // The entry written for SID 5 must come back intact.
+        let ste = lookup_ste(&gm, STRTAB_BASE, STRTAB_LOG2SIZE, SID)
+            .expect("STE lookup should succeed");
+        assert!(ste.valid());
+        assert_eq!(ste.config(), SteConfig::S1_TRANS);
+        assert_eq!(ste.s1_context_ptr(), CD_BASE);
+    }
+
+    #[test]
+    fn test_ste_lookup_invalid_v0() {
+        const SID: u32 = 3;
+        let gm = GuestMemory::allocate(0x40_0000);
+
+        // Store an entry whose V bit is explicitly clear.
+        let invalid = Ste {
+            qw0: SteDw0::new().with_v(false),
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        };
+        write_ste(&gm, SID, &invalid);
+
+        let fault = lookup_ste(&gm, STRTAB_BASE, STRTAB_LOG2SIZE, SID)
+            .expect_err("Should fault on V=0");
+        assert_eq!(fault.event.event_id(), EventId::C_BAD_STE);
+        assert_eq!(fault.event.sid, SID);
+    }
+
+    #[test]
+    fn test_ste_lookup_out_of_range() {
+        // With log2size = 10 the table has 1024 entries, so stream ID 2048
+        // lies beyond its end.
+        let gm = GuestMemory::allocate(0x40_0000);
+        let fault = lookup_ste(&gm, STRTAB_BASE, STRTAB_LOG2SIZE, 2048)
+            .expect_err("Should fault on out-of-range SID");
+        assert_eq!(fault.event.event_id(), EventId::C_BAD_STREAMID);
+    }
+
+    // =========================================================================
+    // STE config dispatch tests
+    // =========================================================================
+
+    #[test]
+    fn test_ste_config_abort() {
+        let action = ste_config_action(&make_abort_ste());
+        assert_eq!(action, Ok(SteAction::Abort));
+    }
+
+    #[test]
+    fn test_ste_config_bypass() {
+        let action = ste_config_action(&make_bypass_ste());
+        assert_eq!(action, Ok(SteAction::Bypass));
+    }
+
+    #[test]
+    fn test_ste_config_s1_trans() {
+        let action = ste_config_action(&make_s1_ste(CD_BASE));
+        assert_eq!(action, Ok(SteAction::S1Translate));
+    }
+
+    #[test]
+    fn test_ste_config_unknown() {
+        // Config = 0b010 is not one of the recognized configurations.
+        let qw0 = SteDw0::new().with_config(0b010).with_v(true);
+        let ste = Ste {
+            qw0,
+            qw1: SteDw1::new(),
+            _qw2_7: [0; 6],
+        };
+
+        let action = ste_config_action(&ste);
+        assert!(action.is_err());
+    }
+
+    // =========================================================================
+    // CD lookup tests
+    // =========================================================================
+
+    #[test]
+    fn test_cd_lookup_valid() {
+        const TTB0: u64 = 0x3000_0000;
+        const T0SZ: u8 = 32;
+
+        let gm = GuestMemory::allocate(0x40_0000);
+        write_cd(
+            &gm,
+            CD_BASE,
+            0,
+            &make_cd(TTB0, T0SZ, Tg0::GRAN_4K, Ips::IPS_40),
+        );
+
+        let cd = lookup_cd(&gm, &make_s1_ste(CD_BASE), 5, 0).expect("CD lookup should succeed");
+        assert!(cd.valid());
+        assert!(cd.aa64());
+        assert_eq!(cd.ttb0(), TTB0);
+        assert_eq!(cd.t0sz(), T0SZ);
+    }
+
+    #[test]
+    fn test_cd_lookup_invalid_v0() {
+        let gm = GuestMemory::allocate(0x40_0000);
+
+        // Store a CD whose V bit is explicitly clear.
+        let qw0 = CdDw0::new().with_v(false);
+        let invalid = Cd {
+            qw0,
+            qw1: CdDw1::new(),
+            _qw2: 0,
+            mair0: 0,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        };
+        write_cd(&gm, CD_BASE, 0, &invalid);
+
+        let fault =
+            lookup_cd(&gm, &make_s1_ste(CD_BASE), 5, 0).expect_err("Should fault on V=0 CD");
+        assert_eq!(fault.event.event_id(), EventId::C_BAD_CD);
+    }
+
+    #[test]
+    fn test_cd_lookup_not_aa64() {
+        let gm = GuestMemory::allocate(0x40_0000);
+
+        // Store a valid CD with AA64=0 (AArch32 — not supported).
+        let qw0 = CdDw0::new().with_aa64(false).with_v(true);
+        let aarch32 = Cd {
+            qw0,
+            qw1: CdDw1::new(),
+            _qw2: 0,
+            mair0: 0,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        };
+        write_cd(&gm, CD_BASE, 0, &aarch32);
+
+        let fault = lookup_cd(&gm, &make_s1_ste(CD_BASE), 5, 0)
+            .expect_err("Should fault on non-AA64 CD");
+        assert_eq!(fault.event.event_id(), EventId::C_BAD_CD);
+    }
+
+    // =========================================================================
+    // Translation context tests
+    // =========================================================================
+
+    #[test]
+    fn test_translation_context_4k() {
+        const TTB0: u64 = 0x4000_0000;
+        const T0SZ: u8 = 32;
+
+        let ctx = translation_context(&make_cd(TTB0, T0SZ, Tg0::GRAN_4K, Ips::IPS_40), 0)
+            .expect("should succeed");
+        assert_eq!(ctx.ttb0, TTB0);
+        assert_eq!(ctx.t0sz, T0SZ);
+        assert_eq!(ctx.tg0, Tg0::GRAN_4K);
+        assert_eq!(ctx.oas_bits, 40);
+        assert_eq!(ctx._asid, 1);
+    }
+
+    #[test]
+    fn test_translation_context_16k() {
+        const T0SZ: u8 = 28;
+
+        let ctx = translation_context(&make_cd(0x8000_0000, T0SZ, Tg0::GRAN_16K, Ips::IPS_48), 0)
+            .expect("should succeed");
+        assert_eq!(ctx.tg0, Tg0::GRAN_16K);
+        assert_eq!(ctx.oas_bits, 48);
+        assert_eq!(ctx.t0sz, T0SZ);
+    }
+
+    #[test]
+    fn test_translation_context_bad_granule() {
+        // TG0 = 0b11 is reserved/invalid; everything else is well-formed.
+        let qw0 = CdDw0::new()
+            .with_aa64(true)
+            .with_ips(Ips::IPS_40.0)
+            .with_tg0(0b11) // invalid
+            .with_t0sz(32)
+            .with_v(true);
+        let cd = Cd {
+            qw0,
+            qw1: CdDw1::new(),
+            _qw2: 0,
+            mair0: 0,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        };
+
+        assert!(translation_context(&cd, 0).is_err());
+    }
+
+    #[test]
+    fn test_translation_context_bad_ips() {
+        // IPS = 0b111 is reserved/invalid; everything else is well-formed.
+        let qw0 = CdDw0::new()
+            .with_aa64(true)
+            .with_ips(0b111) // invalid
+            .with_tg0(Tg0::GRAN_4K.0)
+            .with_t0sz(32)
+            .with_v(true);
+        let cd = Cd {
+            qw0,
+            qw1: CdDw1::new(),
+            _qw2: 0,
+            mair0: 0,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        };
+
+        assert!(translation_context(&cd, 0).is_err());
+    }
+
+    #[test]
+    fn test_translation_context_epd0() {
+        // EPD0=1 disables TTB0 walks; everything else is well-formed.
+        let qw0 = CdDw0::new()
+            .with_epd0(true)
+            .with_aa64(true)
+            .with_ips(Ips::IPS_40.0)
+            .with_tg0(Tg0::GRAN_4K.0)
+            .with_t0sz(32)
+            .with_v(true);
+        let cd = Cd {
+            qw0,
+            qw1: CdDw1::new(),
+            _qw2: 0,
+            mair0: 0,
+            mair1: 0,
+            _qw5_7: [0; 3],
+        };
+
+        assert!(translation_context(&cd, 0).is_err());
+    }
+
+    // =========================================================================
+    // Page table walker tests
+    // =========================================================================
+
+    // Page table memory layout constants. The four tables are spaced
+    // 0x1000 bytes apart.
+    const PT_L0_BASE: u64 = 0x30_0000; // L0 table
+    const PT_L1_BASE: u64 = 0x30_1000; // L1 table
+    const PT_L2_BASE: u64 = 0x30_2000; // L2 table
+    const PT_L3_BASE: u64 = 0x30_3000; // L3 table
+    const DATA_GPA: u64 = 0x4000_0000; // Target GPA for mappings
+
+    /// Returns a walk context rooted at `ttb0` with a 4K granule, T0SZ=32
+    /// (32-bit VA), and a 40-bit output address size.
+    fn make_4k_ctx(ttb0: u64) -> TranslationContext {
+        let (t0sz, oas_bits) = (32, 40);
+        TranslationContext {
+            ttb0,
+            t0sz,
+            tg0: Tg0::GRAN_4K,
+            oas_bits,
+            _mair0: 0xFF440C0400,
+            _asid: 1,
+        }
+    }
+
+    /// Write a page table descriptor at the given address.
+    fn write_pt_desc(gm: &GuestMemory, addr: u64, desc: u64) {
+        gm.write_plain(addr, &desc).expect("write PT desc");
+    }
+
+    /// Encode a table descriptor that links to the next-level table at `next_table`.
+    fn table_desc(next_table: u64) -> u64 {
+        // Valid=1, Type=1 (table), address in bits [47:12].
+        PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true) // table
+            .with_addr_bits(next_table >> 12)
+            .into()
+    }
+
+    /// Encode a block descriptor (AF=1, AP=RW_EL1) whose output address is `output_addr`.
+    fn block_desc(output_addr: u64) -> u64 {
+        PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(false) // block
+            .with_af(true)
+            .with_ap(ApBits::RW_EL1.0)
+            .with_addr_bits(output_addr >> 12)
+            .into()
+    }
+
+    /// Encode an L3 page descriptor (AF=1, AP=RW_EL1) whose output address is `output_addr`.
+    fn page_desc(output_addr: u64) -> u64 {
+        PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true) // page at L3
+            .with_af(true)
+            .with_ap(ApBits::RW_EL1.0)
+            .with_addr_bits(output_addr >> 12)
+            .into()
+    }
+
+    /// Encode an L3 page descriptor (AF=1, AP=RO_EL1) whose output address is `output_addr`.
+    fn ro_page_desc(output_addr: u64) -> u64 {
+        PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_af(true)
+            .with_ap(ApBits::RO_EL1.0)
+            .with_addr_bits(output_addr >> 12)
+            .into()
+    }
+
+    /// Encode an L3 page descriptor (AP=RW_EL1) with the access flag left clear (AF=0).
+    fn no_af_page_desc(output_addr: u64) -> u64 {
+        PtDesc::new()
+            .with_valid(true)
+            .with_desc_type(true)
+            .with_af(false)
+            .with_ap(ApBits::RW_EL1.0)
+            .with_addr_bits(output_addr >> 12)
+            .into()
+    }
+
+    #[test]
+    fn test_walk_4k_single_level_block() {
+        // T0SZ=32 with 4K granule: 32-bit VA space.
+        // Walk starts at level 1 (levels 1, 2, 3).
+        // Map a 1GB block at level 1 entry 0 → DATA_GPA.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // Level 1 entry 0: 1GB block → DATA_GPA.
+        write_pt_desc(&gm, PT_L1_BASE, block_desc(DATA_GPA));
+
+        let result = walk_s1(&gm, &ctx, 0, false, 0);
+        let tr = result.expect("should translate");
+        assert_eq!(tr.gpa, DATA_GPA);
+        assert_eq!(tr.page_size, 1 << 30); // 1GB block
+    }
+
+    #[test]
+    fn test_walk_4k_four_levels() {
+        // T0SZ=16 with 4K granule: 48-bit VA space, 4 levels (0-3).
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = TranslationContext {
+            ttb0: PT_L0_BASE,
+            t0sz: 16,
+            tg0: Tg0::GRAN_4K,
+            oas_bits: 48,
+            _mair0: 0,
+            _asid: 0,
+        };
+
+        // L0[0] → L1 table
+        write_pt_desc(&gm, PT_L0_BASE, table_desc(PT_L1_BASE));
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → L3 table
+        write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE));
+        // L3[0] → page at DATA_GPA
+        write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA));
+
+        let result = walk_s1(&gm, &ctx, 0, false, 0);
+        let tr = result.expect("should translate");
+        assert_eq!(tr.gpa, DATA_GPA);
+        assert_eq!(tr.page_size, 4096);
+    }
+
+    #[test]
+    fn test_walk_4k_2mb_block() {
+        // T0SZ=32, 4K granule. Level 2 block descriptor (2MB).
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → 2MB block at DATA_GPA
+        write_pt_desc(&gm, PT_L2_BASE, block_desc(DATA_GPA));
+
+        let result = walk_s1(&gm, &ctx, 0, false, 0);
+        let tr = result.expect("should translate");
+        assert_eq!(tr.gpa, DATA_GPA);
+        assert_eq!(tr.page_size, 2 << 20); // 2MB
+    }
+
+    #[test]
+    fn test_walk_4k_page_with_offset() {
+        // Walk to a 4K page and verify the intra-page offset is preserved.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → L3 table
+        write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE));
+        // L3[0] → page at DATA_GPA
+        write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA));
+
+        // Access IOVA 0x0000_0100 — should map to DATA_GPA + 0x100.
+        let result = walk_s1(&gm, &ctx, 0x100, false, 0);
+        let tr = result.expect("should translate");
+        assert_eq!(tr.gpa, DATA_GPA + 0x100);
+        assert_eq!(tr.page_size, 4096);
+    }
+
+    #[test]
+    fn test_walk_4k_block_with_offset() {
+        // Walk to a 2MB block and verify the intra-block offset is preserved.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → 2MB block at DATA_GPA
+        write_pt_desc(&gm, PT_L2_BASE, block_desc(DATA_GPA));
+
+        // Access IOVA 0x0001_2345 — should map to DATA_GPA + 0x0001_2345.
+        let result = walk_s1(&gm, &ctx, 0x0001_2345, false, 0);
+        let tr = result.expect("should translate");
+        assert_eq!(tr.gpa, DATA_GPA + 0x0001_2345);
+        assert_eq!(tr.page_size, 2 << 20);
+    }
+
+    #[test]
+    fn test_walk_fault_unmapped() {
+        // Walk with a PTE that has Valid=0.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[0] is all zeros (invalid).
+        let result = walk_s1(&gm, &ctx, 0, false, 42);
+        let fault = result.expect_err("should fault");
+        assert_eq!(fault.event.event_id(), EventId::F_TRANSLATION);
+        assert_eq!(fault.event.sid, 42);
+    }
+
+    #[test]
+    fn test_walk_fault_permission() {
+        // Write to a read-only page.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → L3 table
+        write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE));
+        // L3[0] → read-only page
+        write_pt_desc(&gm, PT_L3_BASE, ro_page_desc(DATA_GPA));
+
+        // Read should succeed.
+        let result = walk_s1(&gm, &ctx, 0, false, 0);
+        assert!(result.is_ok());
+
+        // Write should fault.
+        let result = walk_s1(&gm, &ctx, 0, true, 0);
+        let fault = result.expect_err("should fault on write to RO");
+        assert_eq!(fault.event.event_id(), EventId::F_PERMISSION);
+    }
+
+    #[test]
+    fn test_walk_fault_access_flag() {
+        // Page with AF=0 — should produce F_ACCESS fault.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → L3 table
+        write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE));
+        // L3[0] → page with AF=0
+        write_pt_desc(&gm, PT_L3_BASE, no_af_page_desc(DATA_GPA));
+
+        let result = walk_s1(&gm, &ctx, 0, false, 0);
+        let fault = result.expect_err("should fault on AF=0");
+        assert_eq!(fault.event.event_id(), EventId::F_ACCESS);
+    }
+
+    #[test]
+    fn test_walk_fault_addr_size() {
+        // Output address exceeds OAS.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        // 32-bit OAS — output addresses must fit in 32 bits.
+        let ctx = TranslationContext {
+            ttb0: PT_L1_BASE,
+            t0sz: 32,
+            tg0: Tg0::GRAN_4K,
+            oas_bits: 32,
+            _mair0: 0,
+            _asid: 0,
+        };
+
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → L3 table
+        write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE));
+        // L3[0] → page at a high address (exceeds 32-bit OAS)
+        let high_addr = 0x2_0000_0000u64; // 8GB, exceeds 32-bit
+        write_pt_desc(&gm, PT_L3_BASE, page_desc(high_addr));
+
+        let result = walk_s1(&gm, &ctx, 0, false, 0);
+        let fault = result.expect_err("should fault on addr size");
+        assert_eq!(fault.event.event_id(), EventId::F_ADDR_SIZE);
+    }
+
+    #[test]
+    fn test_walk_iova_out_of_range() {
+        // IOVA exceeds the VA range defined by T0SZ.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE); // T0SZ=32, VA range = 2^32
+
+        // IOVA = 0x1_0000_0000 (exceeds 32-bit range).
+        let result = walk_s1(&gm, &ctx, 0x1_0000_0000, false, 0);
+        let fault = result.expect_err("should fault on out-of-range IOVA");
+        assert_eq!(fault.event.event_id(), EventId::F_TRANSLATION);
+    }
+
+    #[test]
+    fn test_walk_nonzero_l1_index() {
+        // Verify that non-zero L1 indices work correctly.
+        // T0SZ=32, 4K: L1 has 4 entries (indices 0-3, each covering 1GB).
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[2] → 1GB block at DATA_GPA (IOVA starting at 2GB).
+        let l1_entry2_addr = PT_L1_BASE + 2 * 8;
+        write_pt_desc(&gm, l1_entry2_addr, block_desc(DATA_GPA));
+
+        // IOVA = 0x8000_0000 (2GB) should use L1 index 2.
+        let result = walk_s1(&gm, &ctx, 0x8000_0000, false, 0);
+        let tr = result.expect("should translate");
+        assert_eq!(tr.gpa, DATA_GPA);
+        assert_eq!(tr.page_size, 1 << 30);
+    }
+
+    #[test]
+    fn test_walk_nonzero_l3_index() {
+        // Verify non-zero L3 index with 4K pages.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        // L1[0] → L2 table
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        // L2[0] → L3 table
+        write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE));
+        // L3[5] → page at DATA_GPA + 0x5000
+        let target = DATA_GPA + 0x5000;
+        write_pt_desc(&gm, PT_L3_BASE + 5 * 8, page_desc(target));
+
+        // IOVA = 0x5000 (L3 index 5) + offset 0x42.
+        let result = walk_s1(&gm, &ctx, 0x5042, false, 0);
+        let tr = result.expect("should translate");
+        assert_eq!(tr.gpa, target + 0x42);
+        assert_eq!(tr.page_size, 4096);
+    }
+
+    #[test]
+    fn test_walk_write_to_rw_page() {
+        // Write to a RW page should succeed.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = make_4k_ctx(PT_L1_BASE);
+
+        write_pt_desc(&gm, PT_L1_BASE, table_desc(PT_L2_BASE));
+        write_pt_desc(&gm, PT_L2_BASE, table_desc(PT_L3_BASE));
+        write_pt_desc(&gm, PT_L3_BASE, page_desc(DATA_GPA));
+
+        let result = walk_s1(&gm, &ctx, 0, true, 0);
+        let tr = result.expect("write to RW page should succeed");
+        assert_eq!(tr.gpa, DATA_GPA);
+    }
+
+    #[test]
+    fn test_compute_start_level_4k() {
+        // T0SZ=32, 4K: VA bits=32, resolve=20, levels=ceil(20/9)=3, start=1
+        assert_eq!(compute_start_level(Tg0::GRAN_4K, 32), Some((1, 32)));
+        // T0SZ=16, 4K: VA bits=48, resolve=36, levels=4, start=0
+        assert_eq!(compute_start_level(Tg0::GRAN_4K, 16), Some((0, 48)));
+        // T0SZ=25, 4K: VA bits=39, resolve=27, levels=3, start=1
+        assert_eq!(compute_start_level(Tg0::GRAN_4K, 25), Some((1, 39)));
+    }
+
+    #[test]
+    fn test_walk_degenerate_t0sz_returns_fault() {
+        // 64KB granule with T0SZ=48 produces resolve_bits=0. Without
+        // the guard in compute_start_level, walk_s1 would compute
+        // start_level=4 and then evaluate (3u32 - 4u32), panicking
+        // in debug mode. Verify it returns a translation fault instead.
+        let gm = GuestMemory::allocate(0x5000_0000);
+        let ctx = TranslationContext {
+            ttb0: PT_L1_BASE,
+            t0sz: 48,
+            tg0: Tg0::GRAN_64K,
+            oas_bits: 40,
+            _mair0: 0,
+            _asid: 0,
+        };
+
+        let result = walk_s1(&gm, &ctx, 0, false, 99);
+        let fault = result.expect_err("degenerate T0SZ must fault, not panic");
+        assert_eq!(fault.event.event_id(), EventId::F_TRANSLATION);
+        assert_eq!(fault.event.sid, 99);
+    }
+}
diff --git a/vm/devices/pci/pci_core/src/bus_range.rs b/vm/devices/pci/pci_core/src/bus_range.rs
index 3dcc1c2f92..1412992253 100644
--- a/vm/devices/pci/pci_core/src/bus_range.rs
+++ b/vm/devices/pci/pci_core/src/bus_range.rs
@@ -9,10 +9,8 @@
 //! [`ConfigSpaceType1Emulator`](crate::cfg_space_emu::ConfigSpaceType1Emulator)
 //! when the guest writes bus number registers, and on restore/reset.
 //!
-//! Consumers (ITS wrappers, SMMU) compose a full device identity from the
-//! bus range plus the device's BDF. The segment number is not included
-//! here — it is a static property of the root complex and is held
-//! separately by the consumer.
+//! Consumers (ITS wrappers, SMMU) read the bus range to compose a full
+//! device identity from the bus range plus the device's BDF.
 
 use std::sync::Arc;
 use std::sync::atomic::AtomicU16;
diff --git a/vmm_core/src/acpi_builder.rs b/vmm_core/src/acpi_builder.rs
index 49503f2f4e..98e622a372 100644
--- a/vmm_core/src/acpi_builder.rs
+++ b/vmm_core/src/acpi_builder.rs
@@ -22,6 +22,25 @@ use vm_topology::processor::x86::X86Topology;
 use x86defs::apic::APIC_BASE_ADDRESS;
 use zerocopy::IntoBytes;
 
+/// Configuration for one SMMUv3 node in the ACPI IORT table.
+#[derive(Debug, Clone)]
+pub struct AcpiSmmuConfig {
+    /// Index of the root complex this SMMU covers (matches
+    /// `PcieHostBridge.index`). Used to route each RC's IORT ID mapping
+    /// to its specific SMMU node.
+    pub rc_index: u32,
+    /// PCIe segment number of the root complex this SMMU covers. Used as
+    /// the output_base in the SMMU→ITS ID mapping to produce globally
+    /// unique ITS device IDs: `(segment << 16) | BDF`.
+    pub segment: u16,
+    /// MMIO base address of the SMMU.
+    pub base: u64,
+    /// GIC SPI INTID for the event queue interrupt.
+    pub event_gsiv: u32,
+    /// GIC SPI INTID for the global error interrupt.
+    pub gerr_gsiv: u32,
+}
+
 /// Binary ACPI tables constructed by [`AcpiTablesBuilder`].
 pub struct BuiltAcpiTables {
     /// The RDSP. Assumed to be given a whole page.
@@ -75,6 +94,9 @@ pub enum AcpiArchConfig {
         hypervisor_vendor_identity: u64,
         /// Virtual timer PPI (GIC INTID).
         virt_timer_ppi: u32,
+        /// SMMUv3 instances. Each entry adds an SMMUv3 IORT node for the root
+        /// complex selected by its `rc_index`. Empty means no SMMU.
+        smmu: Vec<AcpiSmmuConfig>,
     },
 }
 
@@ -365,13 +387,20 @@ impl<T: AcpiTopology> AcpiTablesBuilder<'_, T> {
 
         let its_id = T::iort_its_id(self.processor_topology);
         let has_its = its_id.is_some();
+        let smmu_configs: &[AcpiSmmuConfig] = match &self.arch {
+            AcpiArchConfig::Aarch64 { smmu, .. } => smmu.as_slice(),
+            _ => &[],
+        };
+        let has_smmu = !smmu_configs.is_empty();
         let its_node_count: u32 = if has_its { 1 } else { 0 };
-        let node_count = its_node_count + self.pcie_host_bridges.len() as u32;
-        let mapping_count: u32 = if has_its { 1 } else { 0 };
+        let smmu_node_count = smmu_configs.len() as u32;
+        let node_count = its_node_count + smmu_node_count + self.pcie_host_bridges.len() as u32;
+        // Each RC gets one ID mapping when there's a target node (SMMU or ITS).
+        let rc_mapping_count: u32 = if has_smmu || has_its { 1 } else { 0 };
 
         let mut iort_extra: Vec<u8> = Vec::new();
 
-        // ITS Group node comes first so root complexes can reference it.
+        // ITS Group node comes first so other nodes can reference it.
         // The ITS Group node offset (from table start) is IORT_NODE_OFFSET.
         let its_group_offset = iort::IORT_NODE_OFFSET;
         if let Some(id) = its_id {
@@ -380,21 +409,103 @@ impl<T: AcpiTopology> AcpiTablesBuilder<'_, T> {
             iort_extra.extend_from_slice(&id.to_ne_bytes());
         }
 
+        // SMMUv3 nodes come after ITS Group (if present).
+        // Build a map from RC index → SMMU node offset for RC routing.
+        let mut smmu_rc_offsets: Vec<(u32, u32)> = Vec::new();
+        for cfg in smmu_configs {
+            let smmu_node_offset = iort::IORT_NODE_OFFSET + iort_extra.len() as u32;
+            smmu_rc_offsets.push((cfg.rc_index, smmu_node_offset));
+
+            if has_its {
+                // The SMMUv3 node needs two ID mappings when ITS is present:
+                //
+                // [0] Range mapping: translates PCI device stream IDs through
+                //     the SMMU to the ITS. Used by iort_node_map_id() during
+                //     RC → SMMUv3 → ITS traversal for PCI MSI domain discovery.
+                //
+                // [1] Single mapping: identifies the ITS group for the SMMU's
+                //     own MSI domain lookup. Referenced by
+                //     device_id_mapping_index. Linux's iort_set_device_domain()
+                //     requires IORT_ID_SINGLE_MAPPING flag on this entry.
+                //
+                // Both mappings are needed even though the SMMU uses wired SPIs
+                // (IDR0.MSI=0, GSIVs populated) for its own interrupts. The
+                // device_id_mapping is required for Linux's IORT MSI domain
+                // resolution infrastructure, which is independent of the
+                // SMMU's actual interrupt delivery mechanism.
+                let smmu = iort::IortSmmuV3::new_with_device_id_mapping(
+                    0,
+                    cfg.base,
+                    2,
+                    cfg.event_gsiv,
+                    cfg.gerr_gsiv,
+                    1, // device_id_mapping_index → mapping [1]
+                );
+                iort_extra.extend_from_slice(smmu.as_bytes());
+
+                // Mapping [0]: range mapping for PCI device stream IDs.
+                // The output_base applies the segment offset so the ITS
+                // receives globally unique device IDs: (segment << 16) | BDF.
+                // Stream IDs within this SMMU are plain BDFs (0-based).
+                iort_extra.extend_from_slice(
+                    iort::IortIdMapping::new(
+                        0,                          // input_base
+                        0xFFFF,                     // id_count (16-bit BDF range)
+                        (cfg.segment as u32) << 16, // output_base
+                        its_group_offset,           // output_reference → ITS group
+                        0,                          // flags
+                    )
+                    .as_bytes(),
+                );
+
+                // Mapping [1]: single mapping for the SMMU's MSI domain.
+                iort_extra.extend_from_slice(
+                    iort::IortIdMapping::new(
+                        0,                            // input_base (unused)
+                        0,                            // id_count (unused)
+                        0,                            // output_base (device ID)
+                        its_group_offset,             // output_reference → ITS group
+                        iort::IORT_ID_SINGLE_MAPPING, // flags
+                    )
+                    .as_bytes(),
+                );
+            } else {
+                let smmu = iort::IortSmmuV3::new(0, cfg.base, 0, cfg.event_gsiv, cfg.gerr_gsiv);
+                iort_extra.extend_from_slice(smmu.as_bytes());
+            }
+        }
+
         for bridge in self.pcie_host_bridges {
-            let rc = iort::IortPciRootComplex::new(bridge.index, bridge.segment, mapping_count);
+            let rc = iort::IortPciRootComplex::new(bridge.index, bridge.segment, rc_mapping_count);
             iort_extra.extend_from_slice(rc.as_bytes());
 
-            if has_its {
-                // Single ID mapping: full RID range → ITS Group node.
-                // output_base uses (segment << 16) so device IDs in the
-                // ITS namespace are unique across PCI segments.
+            if rc_mapping_count > 0 {
+                // Route this RC to its SMMU if one exists,
+                // otherwise directly to the ITS group.
+                let (rc_target_offset, has_smmu) = smmu_rc_offsets
+                    .iter()
+                    .find(|(idx, _)| *idx == bridge.index)
+                    .map(|(_, off)| (*off, true))
+                    .unwrap_or((its_group_offset, false));
+
+                // When the RC has an SMMU, output_base is 0 because stream
+                // IDs are plain BDFs within the per-RC SMMU. The segment
+                // offset is applied in the SMMU→ITS mapping instead.
+                // When the RC goes directly to the ITS, output_base embeds
+                // the segment for globally unique ITS device IDs.
+                let output_base = if has_smmu {
+                    0
+                } else {
+                    (bridge.segment as u32) << 16
+                };
+
                 iort_extra.extend_from_slice(
                     iort::IortIdMapping::new(
-                        0,                             // input_base
-                        0xFFFF, // id_count (full 16-bit BDF range, minus 1 per IORT spec)
-                        (bridge.segment as u32) << 16, // output_base
-                        its_group_offset, // output_reference
-                        0,      // flags
+                        0,                // input_base
+                        0xFFFF,           // id_count (full 16-bit BDF range)
+                        output_base,      // output_base
+                        rc_target_offset, // output_reference
+                        0,                // flags
                     )
                     .as_bytes(),
                 );
@@ -1009,6 +1120,7 @@ mod test {
             arch: AcpiArchConfig::Aarch64 {
                 hypervisor_vendor_identity: 0,
                 virt_timer_ppi: 20,
+                smmu: vec![],
             },
         }
     }
@@ -1144,4 +1256,250 @@ mod test {
         assert!(contains_signature(&tables.tables, b"MCFG"));
         assert!(contains_signature(&tables.tables, b"IORT"));
     }
+
+    fn new_aarch64_builder_with_smmu<'a>(
+        mem_layout: &'a MemoryLayout,
+        processor_topology: &'a ProcessorTopology<Aarch64Topology>,
+        pcie_host_bridges: &'a Vec<PcieHostBridge>,
+        smmu_base: u64,
+    ) -> AcpiTablesBuilder<'a, Aarch64Topology> {
+        AcpiTablesBuilder {
+            processor_topology,
+            mem_layout,
+            cache_topology: None,
+            pcie_host_bridges,
+            arch: AcpiArchConfig::Aarch64 {
+                hypervisor_vendor_identity: 0,
+                virt_timer_ppi: 20,
+                smmu: vec![AcpiSmmuConfig {
+                    rc_index: 0,
+                    segment: 0,
+                    base: smmu_base,
+                    event_gsiv: 35,
+                    gerr_gsiv: 36,
+                }],
+            },
+        }
+    }
+
+    fn u64_at(data: &[u8], offset: usize) -> u64 {
+        u64::from_ne_bytes(data[offset..offset + 8].try_into().unwrap())
+    }
+
+    fn u16_at(data: &[u8], offset: usize) -> u16 {
+        u16::from_ne_bytes(data[offset..offset + 2].try_into().unwrap())
+    }
+
+    #[test]
+    fn test_iort_with_smmu_and_its() {
+        use acpi_spec::iort;
+
+        let mem = new_mem();
+        let topology = new_aarch64_its_topology();
+        let smmu_base: u64 = 0xEFFA_0000;
+        let pcie_host_bridges = vec![PcieHostBridge {
+            index: 0,
+            segment: 0,
+            start_bus: 0,
+            end_bus: 255,
+            ecam_range: MemoryRange::new(0..256 * 256 * 4096),
+            low_mmio: MemoryRange::new(0xdc000000..0xe0000000),
+            high_mmio: MemoryRange::new(0x1000000000..0x1040000000),
+        }];
+        let builder = new_aarch64_builder_with_smmu(&mem, &topology, &pcie_host_bridges, smmu_base);
+
+        let data = builder.build_iort().unwrap();
+
+        // IORT header
+        assert_eq!(&data[0..4], b"IORT");
+        assert_eq!(u32_at(&data, 4) as usize, data.len());
+        assert_eq!(checksum(&data), 0);
+
+        // 3 nodes: ITS Group + SMMUv3 + 1 RC
+        assert_eq!(u32_at(&data, 36), 3);
+
+        // First node: ITS Group at IORT_NODE_OFFSET
+        let its_node = iort::IORT_NODE_OFFSET as usize;
+        assert_eq!(data[its_node], iort::IORT_NODE_TYPE_ITS_GROUP);
+        let its_group_size = 24usize; // 20-byte struct + 4-byte ITS ID
+
+        // Second node: SMMUv3
+        let smmu_node = its_node + its_group_size;
+        assert_eq!(data[smmu_node], iort::IORT_NODE_TYPE_SMMUV3);
+        // base_address at offset 16 from node start
+        assert_eq!(u64_at(&data, smmu_node + 16), smmu_base);
+        // flags: COHACC | DEVICEID_VALID (has ITS mappings)
+        assert_eq!(
+            u32_at(&data, smmu_node + 24),
+            iort::IORT_SMMUV3_FLAG_COHACC | iort::IORT_SMMUV3_FLAG_DEVICEID_VALID
+        );
+        // model: 0 (generic); sits at +40, right after the 8-byte VATOS address at +32
+        assert_eq!(u32_at(&data, smmu_node + 40), 0);
+        // mapping_count = 2 (range + single for MSI domain)
+        assert_eq!(u32_at(&data, smmu_node + 8), 2);
+        // device_id_mapping_index = 1
+        assert_eq!(u32_at(&data, smmu_node + 64), 1);
+        // SMMU mapping [0]: range mapping for PCI device stream IDs
+        let smmu_node_len = u16_at(&data, smmu_node + 1) as usize;
+        let smmu_mapping_0 = smmu_node + 68; // IortSmmuV3 is 68 bytes
+        assert_eq!(u32_at(&data, smmu_mapping_0 + 12), iort::IORT_NODE_OFFSET); // → ITS group
+        assert_eq!(u32_at(&data, smmu_mapping_0 + 16), 0); // flags: no SINGLE_MAPPING
+        // SMMU mapping [1]: single mapping for SMMU's own MSI domain
+        let smmu_mapping_1 = smmu_mapping_0 + 20; // IortIdMapping is 20 bytes
+        assert_eq!(u32_at(&data, smmu_mapping_1 + 12), iort::IORT_NODE_OFFSET); // → ITS group
+        assert_eq!(
+            u32_at(&data, smmu_mapping_1 + 16),
+            iort::IORT_ID_SINGLE_MAPPING
+        ); // flags
+
+        // Third node: Root Complex
+        let rc_node = smmu_node + smmu_node_len;
+        assert_eq!(data[rc_node], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX);
+        assert_eq!(u32_at(&data, rc_node + 8), 1); // mapping_count
+        // RC → SMMUv3 mapping
+        let rc_mapping = rc_node + 36;
+        assert_eq!(u32_at(&data, rc_mapping), 0); // input_base
+        assert_eq!(u32_at(&data, rc_mapping + 4), 0xFFFF); // id_count
+        assert_eq!(u32_at(&data, rc_mapping + 8), 0); // output_base (0: has SMMU)
+        assert_eq!(u32_at(&data, rc_mapping + 12), smmu_node as u32); // → SMMUv3
+    }
+
+    #[test]
+    fn test_iort_with_smmu_multi_rc() {
+        use acpi_spec::iort;
+
+        let mem = new_mem();
+        let topology = new_aarch64_its_topology();
+        let smmu_base: u64 = 0xEFFA_0000;
+        let pcie_host_bridges = vec![
+            PcieHostBridge {
+                index: 0,
+                segment: 0,
+                start_bus: 0,
+                end_bus: 255,
+                ecam_range: MemoryRange::new(0..256 * 256 * 4096),
+                low_mmio: MemoryRange::new(0xdc000000..0xe0000000),
+                high_mmio: MemoryRange::new(0x1000000000..0x1040000000),
+            },
+            PcieHostBridge {
+                index: 1,
+                segment: 2,
+                start_bus: 0,
+                end_bus: 63,
+                ecam_range: MemoryRange::new(5 * GB..5 * GB + 64 * 256 * 4096),
+                low_mmio: MemoryRange::new(0xe0000000..0xe4000000),
+                high_mmio: MemoryRange::new(0x1040000000..0x1080000000),
+            },
+        ];
+        let builder = new_aarch64_builder_with_smmu(&mem, &topology, &pcie_host_bridges, smmu_base);
+
+        let data = builder.build_iort().unwrap();
+
+        // 4 nodes: ITS + SMMUv3 + 2 RCs
+        assert_eq!(u32_at(&data, 36), 4);
+        assert_eq!(checksum(&data), 0);
+
+        // ITS Group
+        let its_node = iort::IORT_NODE_OFFSET as usize;
+        let its_group_size = 24usize;
+
+        // SMMUv3 node
+        let smmu_node = its_node + its_group_size;
+        assert_eq!(data[smmu_node], iort::IORT_NODE_TYPE_SMMUV3);
+        let smmu_node_len = u16_at(&data, smmu_node + 1) as usize;
+
+        // RC 0: segment 0 → SMMUv3
+        let rc0 = smmu_node + smmu_node_len;
+        assert_eq!(data[rc0], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX);
+        let rc0_mapping = rc0 + 36;
+        assert_eq!(u32_at(&data, rc0_mapping + 8), 0); // output_base (0: has SMMU)
+        assert_eq!(u32_at(&data, rc0_mapping + 12), smmu_node as u32); // → SMMUv3
+
+        // RC 1: segment 2 → ITS directly (only the RC with index 0 has an SMMU)
+        let rc0_len = u16_at(&data, rc0 + 1) as usize;
+        let rc1 = rc0 + rc0_len;
+        assert_eq!(data[rc1], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX);
+        let rc1_mapping = rc1 + 36;
+        assert_eq!(u32_at(&data, rc1_mapping + 8), 2 << 16); // output_base seg 2
+        assert_eq!(u32_at(&data, rc1_mapping + 12), its_node as u32); // → ITS group
+    }
+
+    #[test]
+    fn test_iort_without_smmu_unchanged() {
+        // Verify the no-SMMU case still produces RC→ITS directly (regression).
+        use acpi_spec::iort;
+
+        let mem = new_mem();
+        let topology = new_aarch64_its_topology();
+        let pcie_host_bridges = vec![PcieHostBridge {
+            index: 0,
+            segment: 0,
+            start_bus: 0,
+            end_bus: 255,
+            ecam_range: MemoryRange::new(0..256 * 256 * 4096),
+            low_mmio: MemoryRange::new(0xdc000000..0xe0000000),
+            high_mmio: MemoryRange::new(0x1000000000..0x1040000000),
+        }];
+        let builder = new_aarch64_builder(&mem, &topology, &pcie_host_bridges);
+
+        let data = builder.build_iort().unwrap();
+
+        // 2 nodes: ITS Group + RC (no SMMUv3)
+        assert_eq!(u32_at(&data, 36), 2);
+
+        // RC mapping points directly to ITS group
+        let its_node = iort::IORT_NODE_OFFSET as usize;
+        let rc_node = its_node + 24; // ITS group = 24 bytes
+        assert_eq!(data[rc_node], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX);
+        let rc_mapping = rc_node + 36;
+        assert_eq!(u32_at(&data, rc_mapping + 12), iort::IORT_NODE_OFFSET); // → ITS group
+    }
+
+    #[test]
+    fn test_iort_smmuv3_node_fields() {
+        use acpi_spec::iort;
+
+        let mem = new_mem();
+        let topology = new_aarch64_its_topology();
+        let smmu_base: u64 = 0xEFFA_0000;
+        let pcie_host_bridges = vec![PcieHostBridge {
+            index: 0,
+            segment: 0,
+            start_bus: 0,
+            end_bus: 255,
+            ecam_range: MemoryRange::new(0..256 * 256 * 4096),
+            low_mmio: MemoryRange::new(0xdc000000..0xe0000000),
+            high_mmio: MemoryRange::new(0x1000000000..0x1040000000),
+        }];
+        let builder = new_aarch64_builder_with_smmu(&mem, &topology, &pcie_host_bridges, smmu_base);
+
+        let data = builder.build_iort().unwrap();
+
+        let smmu_node = iort::IORT_NODE_OFFSET as usize + 24; // after ITS group
+        // Node type
+        assert_eq!(data[smmu_node], iort::IORT_NODE_TYPE_SMMUV3);
+        // Revision
+        assert_eq!(data[smmu_node + 3], iort::IORT_SMMUV3_REVISION);
+        // Base address
+        assert_eq!(u64_at(&data, smmu_node + 16), smmu_base);
+        // Flags: COHACC | DEVICEID_VALID
+        assert_eq!(
+            u32_at(&data, smmu_node + 24),
+            iort::IORT_SMMUV3_FLAG_COHACC | iort::IORT_SMMUV3_FLAG_DEVICEID_VALID
+        );
+        // Reserved
+        assert_eq!(u32_at(&data, smmu_node + 28), 0);
+        // VATOS address = 0
+        assert_eq!(u64_at(&data, smmu_node + 32), 0);
+        // Model = 0 (generic)
+        assert_eq!(
+            u32_at(&data, smmu_node + 40),
+            iort::IORT_SMMUV3_MODEL_GENERIC
+        );
+        // GSIVs: wired SPIs for event and gerror
+        assert_eq!(u32_at(&data, smmu_node + 44), 35); // event_gsiv
+        assert_eq!(u32_at(&data, smmu_node + 48), 0); // pri_gsiv
+        assert_eq!(u32_at(&data, smmu_node + 52), 36); // gerr_gsiv
+        assert_eq!(u32_at(&data, smmu_node + 56), 0); // sync_gsiv
+    }
 }
diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs
index 77b04fdd52..a9c1bfba41 100644
--- a/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs
+++ b/vmm_tests/vmm_tests/tests/tests/multiarch/pcie.rs
@@ -493,3 +493,111 @@ async fn pcie_nvme_boot(config: PetriVmBuilder<OpenVmmPetriBackend>) -> anyhow::
     vm.wait_for_clean_teardown().await?;
     Ok(())
 }
+
+/// Test SMMUv3 IOMMU emulation with a mixed topology:
+///
+/// - Root complex s0rc0 (segment 0): SMMU enabled, virtio-net + NVMe behind it
+/// - Root complex s1rc0 (segment 1): no SMMU, virtio-net + NVMe behind it
+///
+/// Verifies:
+/// 1. Linux discovers the SMMUv3 (dmesg shows arm-smmu-v3 init)
+/// 2. IORT ACPI table is present
+/// 3. Devices behind the SMMU RC are in IOMMU groups
+/// 4. Devices on both RCs enumerate and function (block I/O, network interfaces)
+/// 5. DMA through SMMU works (I/O on every NVMe, including the one behind the SMMU)
+#[openvmm_test(linux_direct_aarch64)]
+async fn smmu_mixed_topology(config: PetriVmBuilder<OpenVmmPetriBackend>) -> anyhow::Result<()> {
+    let (vm, agent) = config
+        .modify_backend(|b| {
+            b.with_pcie_root_topology(2, 1, 4) // 2 segments, 1 RC each, 4 ports each
+                .with_smmu(&["s0rc0"]) // SMMU only on segment 0's RC
+                .with_pcie_nvme("s0rc0rp0", PCIE_NVME_SUBSYSTEM_IDS[0])
+                .with_virtio_nic("s0rc0rp1")
+                .with_pcie_nvme("s1rc0rp0", PCIE_NVME_SUBSYSTEM_IDS[1])
+                .with_virtio_nic("s1rc0rp1")
+        })
+        .run()
+        .await?;
+
+    let sh = agent.unix_shell();
+
+    // 1. Verify SMMUv3 is discovered by Linux
+    let dmesg = cmd!(sh, "dmesg").read().await?;
+    tracing::info!(dmesg_len = dmesg.len(), "dmesg captured");
+
+    let smmu_lines: Vec<&str> = dmesg
+        .lines()
+        .filter(|l| l.contains("smmu") || l.contains("SMMU") || l.contains("arm-smmu"))
+        .collect();
+    tracing::info!(?smmu_lines, "SMMU-related dmesg lines");
+    assert!(
+        dmesg.contains("arm-smmu-v3"),
+        "Linux should discover the SMMUv3 in dmesg. SMMU lines:\n{}",
+        smmu_lines.join("\n")
+    );
+
+    // 2. Verify IORT ACPI table is present
+    let acpi_tables = cmd!(sh, "ls /sys/firmware/acpi/tables/").read().await?;
+    assert!(
+        acpi_tables.contains("IORT"),
+        "IORT ACPI table should be present. Tables: {acpi_tables}"
+    );
+
+    // 3. Verify IOMMU groups exist (devices behind the SMMU RC)
+    let iommu_groups = cmd!(sh, "ls /sys/kernel/iommu_groups/")
+        .read()
+        .await
+        .unwrap_or_default();
+    tracing::info!(%iommu_groups, "IOMMU groups");
+    assert!(
+        !iommu_groups.trim().is_empty(),
+        "IOMMU groups should exist for devices behind the SMMU"
+    );
+
+    // 4. Verify all NVMe devices enumerate and have block devices
+    let block_devs = cmd!(sh, "ls /sys/block/").read().await?;
+    let nvme_devs: Vec<&str> = block_devs
+        .split_whitespace()
+        .filter(|d| d.starts_with("nvme"))
+        .collect();
+    assert_eq!(
+        nvme_devs.len(), 2,
+        "both NVMe controllers should create block devices: {block_devs}"
+    );
+
+    // 5. Verify NVMe behind SMMU works: write and read back data on every
+    //    NVMe block device, not just the first one listed. The NVMe on
+    //    s0rc0rp0 has DMA going through SMMU translation, and nothing in this
+    //    test pins which controller shows up first in /sys/block, so checking
+    //    a single device could end up exercising only the non-SMMU path.
+    //    If the SMMU page tables are not set up correctly, the I/O on the
+    //    SMMU-backed device would fail.
+    for dev in &nvme_devs {
+        // Write random data and read the same range back to exercise DMA
+        cmd!(
+            sh,
+            "dd if=/dev/urandom of=/dev/{dev} bs=4096 count=16 oflag=direct"
+        )
+        .read()
+        .await?;
+        cmd!(
+            sh,
+            "dd if=/dev/{dev} of=/dev/null bs=4096 count=16 iflag=direct"
+        )
+        .read()
+        .await?;
+    }
+
+    // 6. Verify virtio-net interfaces exist on both RCs
+    let net_devs = cmd!(sh, "ls /sys/class/net/").read().await?;
+    let net_count = net_devs.split_whitespace().filter(|d| *d != "lo").count();
+    tracing::info!(%net_devs, net_count, "network devices");
+    assert!(
+        net_count >= 2,
+        "at least 2 network interfaces should exist (got {net_count}): {net_devs}"
+    );
+
+    agent.power_off().await?;
+    vm.wait_for_clean_teardown().await?;
+    Ok(())
+}

From a3682507eaa651c2cf80fe90213ac90d8bc8df52 Mon Sep 17 00:00:00 2001
From: John Starks <jostarks@microsoft.com>
Date: Tue, 19 May 2026 16:22:52 -0700
Subject: [PATCH 7/7] openvmm_core: move SMMU wiring helpers into smmu_wiring module

---
 openvmm/openvmm_core/src/worker/dispatch.rs   | 113 +++++-------------
 .../src/worker/dispatch/smmu_wiring.rs        | 100 ++++++++++++++++
 2 files changed, 131 insertions(+), 82 deletions(-)
 create mode 100644 openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs

diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs
index a1a9044f16..0cd1a38c54 100644
--- a/openvmm/openvmm_core/src/worker/dispatch.rs
+++ b/openvmm/openvmm_core/src/worker/dispatch.rs
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+mod smmu_wiring;
+
 use crate::emuplat;
 use crate::partition::BindHvliteVp;
 use crate::partition::HvlitePartition;
@@ -407,22 +409,12 @@ pub(crate) struct InitializedVm {
     chipset_low_mmio: MemoryRange,
     chipset_high_mmio: MemoryRange,
     vtl2_chipset_mmio: MemoryRange,
-    resolved_smmu_resources: Vec<ResolvedSmmuResources>,
+    resolved_smmu_resources: Vec<smmu_wiring::ResolvedSmmuResources>,
     processor_topology: ProcessorTopology,
     igvm_file: Option<IgvmFile>,
     driver_source: VmTaskDriverSource,
 }
 
-/// Resolved resources for a single SMMUv3 instance.
-struct ResolvedSmmuResources {
-    /// MMIO base address (from the memory layout allocator).
-    base: u64,
-    /// GIC INTID for the event queue interrupt (from the SPI allocator).
-    evtq_gsiv: u32,
-    /// GIC INTID for the global error interrupt (from the SPI allocator).
-    gerr_gsiv: u32,
-}
-
 trait ExtractTopologyConfig {
     fn to_config(&self) -> ProcessorTopologyConfig;
 }
@@ -515,6 +507,7 @@ impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
 struct Aarch64TopologyResult {
     processor_topology: ProcessorTopology<Aarch64Topology>,
     spi_layout: super::spi_layout::ResolvedSpiLayout,
+    smmu_count: usize,
 }
 
 #[cfg(guest_arch = "aarch64")]
@@ -654,6 +647,8 @@ fn build_aarch64_topology(
         gic_nr_irqs,
     };
 
+    let smmu_count = arch.smmu.len();
+
     let mut builder = TopologyBuilder::new_aarch64(platform);
     if let Some(smt) = config.enable_smt {
         builder.smt_enabled(smt);
@@ -666,6 +661,7 @@ fn build_aarch64_topology(
     Ok(Aarch64TopologyResult {
         processor_topology: builder.build(config.proc_count)?,
         spi_layout,
+        smmu_count,
     })
 }
 
@@ -894,9 +890,13 @@ impl InitializedVm {
         };
 
         #[cfg(guest_arch = "aarch64")]
-        let (processor_topology, spi_layout) = {
+        let (processor_topology, spi_layout, smmu_count) = {
             let result = build_aarch64_topology(&cfg.processor_topology, &platform_info)?;
-            (result.processor_topology, result.spi_layout)
+            (
+                result.processor_topology,
+                result.spi_layout,
+                result.smmu_count,
+            )
         };
         #[cfg(not(guest_arch = "aarch64"))]
         let processor_topology = build_x86_topology(&cfg.processor_topology)?;
@@ -956,22 +956,9 @@ impl InitializedVm {
             .filter(|(bus, _)| matches!(bus, VirtioBus::Mmio))
             .count();
 
-        // Count SMMU instances so the layout engine can allocate their MMIO.
-        let smmu_count = {
-            #[cfg(guest_arch = "aarch64")]
-            {
-                match &cfg.processor_topology.arch {
-                    Some(ArchTopologyConfig::Aarch64(Aarch64TopologyConfig { smmu, .. })) => {
-                        smmu.len()
-                    }
-                    _ => 0,
-                }
-            }
-            #[cfg(not(guest_arch = "aarch64"))]
-            {
-                0
-            }
-        };
+        // On aarch64, smmu_count comes from build_aarch64_topology; elsewhere there are no SMMUs.
+        #[cfg(not(guest_arch = "aarch64"))]
+        let smmu_count = 0;
 
         let resolved_layout = resolve_memory_layout(MemoryLayoutInput {
             mem_size: cfg.memory.mem_size,
@@ -992,21 +979,11 @@ impl InitializedVm {
         let vtl2_chipset_mmio = resolved_layout.vtl2_chipset_mmio;
 
         // Combine SMMU MMIO ranges with SPI layout.
-        cfg_if! {
-            if #[cfg(guest_arch = "aarch64")] {
-                let resolved_smmu_resources: Vec<ResolvedSmmuResources> = resolved_layout.smmu_ranges
-                    .iter()
-                    .zip(&spi_layout.smmu)
-                    .map(|(range, spis)| ResolvedSmmuResources {
-                        base: range.start(),
-                        evtq_gsiv: spis.evtq_gsiv,
-                        gerr_gsiv: spis.gerr_gsiv,
-                    })
-                    .collect();
-            } else {
-                let resolved_smmu_resources: Vec<ResolvedSmmuResources> = Vec::new();
-            }
-        }
+        #[cfg(guest_arch = "aarch64")]
+        let resolved_smmu_resources =
+            smmu_wiring::resolve_smmu_resources(&resolved_layout.smmu_ranges, &spi_layout);
+        #[cfg(not(guest_arch = "aarch64"))]
+        let resolved_smmu_resources: Vec<smmu_wiring::ResolvedSmmuResources> = Vec::new();
 
         // Place the alias map at the end of the address space. Newer versions
         // of OpenHCL support receiving this offset via devicetree (especially
@@ -2111,40 +2088,13 @@ impl InitializedVm {
             });
         }
 
-        // Build a port-name → SMMU shared state map. Each downstream port of
-        // an SMMU-covered root complex inherits that SMMU.
-        let smmu_port_map: std::collections::HashMap<Arc<str>, Arc<smmu::SmmuSharedState>> =
-            smmu_shared_states
-                .iter()
-                .zip(pcie_root_complexes.iter())
-                .flat_map(|(shared, rc)| {
-                    let shared = shared.clone();
-                    rc.lock()
-                        .downstream_ports()
-                        .into_iter()
-                        .filter_map(move |dpi| shared.as_ref().map(|s| (dpi.name, s.clone())))
-                })
-                .collect();
-
-        // Track which RCs have SMMUs (for VFIO blocking).
-        let mut smmu_per_rc = vec![false; pcie_host_bridges.len()];
-        for inst in &smmu_instances {
-            if let Some(&i) = pcie_rc_name_to_idx.get(&inst.rc_name) {
-                smmu_per_rc[i] = true;
-            }
-        }
-
-        // Build port-name set for ports behind SMMUs.
-        let smmu_s1_ports: std::collections::HashSet<Arc<str>> = smmu_per_rc
-            .iter()
-            .zip(pcie_root_complexes.iter())
-            .flat_map(|(&has_smmu, rc)| {
-                rc.lock()
-                    .downstream_ports()
-                    .into_iter()
-                    .filter_map(move |dpi| if has_smmu { Some(dpi.name) } else { None })
-            })
-            .collect();
+        let smmu_port_maps = smmu_wiring::build_smmu_port_maps(
+            &smmu_shared_states,
+            &pcie_root_complexes,
+            &smmu_instances,
+            &pcie_rc_name_to_idx,
+            &pcie_host_bridges,
+        );
 
         // Resolve PCIe devices concurrently.
         //
@@ -2163,8 +2113,7 @@ impl InitializedVm {
             let partition = &partition;
             let mapper = &mapper;
             let port_info = &port_info;
-            let smmu_port_map = &smmu_port_map;
-            let smmu_s1_ports = &smmu_s1_ports;
+            let smmu_port_maps = &smmu_port_maps;
             async move {
                 let port_name: Arc<str> = dev_cfg.port_name.into();
                 let pi = port_info.get(&port_name).ok_or_else(|| {
@@ -2179,7 +2128,7 @@ impl InitializedVm {
                 // into the host IOMMU, so VFIO DMA would bypass S1
                 // translation. This will be lifted when iommufd
                 // nested translation support is available.
-                if dev_cfg.resource.id() == "vfio" && smmu_s1_ports.contains(&port_name) {
+                if dev_cfg.resource.id() == "vfio" && smmu_port_maps.s1_ports.contains(&port_name) {
                     anyhow::bail!(
                         "VFIO device on port {:?} is behind an S1-capable SMMU, \
                          but iommufd nested translation is not available. \
@@ -2198,7 +2147,7 @@ impl InitializedVm {
                     &pi.bus_range,
                     pi.segment,
                     use_its,
-                    smmu_port_map.get(&port_name),
+                    smmu_port_maps.port_map.get(&port_name),
                 );
 
                 vmm_core::device_builder::build_pcie_device(
diff --git a/openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs b/openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs
new file mode 100644
index 0000000000..b58e61187a
--- /dev/null
+++ b/openvmm/openvmm_core/src/worker/dispatch/smmu_wiring.rs
@@ -0,0 +1,100 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! SMMU resource resolution and wiring helpers for aarch64 VMs.
+//!
+//! This module handles combining SMMU MMIO ranges (from the memory layout
+//! allocator) with SPI assignments (from the SPI allocator) into resolved
+//! resources, and building the lookup maps needed for per-device wiring.
+
+use closeable_mutex::CloseableMutex;
+use pcie::root::GenericPcieRootComplex;
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::sync::Arc;
+use vm_topology::pcie::PcieHostBridge;
+
+/// Resolved resources for a single SMMUv3 instance: one MMIO allocation paired
+/// with the SPI allocation for the same instance.
+pub(super) struct ResolvedSmmuResources {
+    /// MMIO base address: the start of the range from the memory layout allocator.
+    pub base: u64,
+    /// GIC INTID for the event queue interrupt (from the SPI allocator).
+    pub evtq_gsiv: u32,
+    /// GIC INTID for the global error interrupt (from the SPI allocator).
+    pub gerr_gsiv: u32,
+}
+
+/// Pairs each SMMU MMIO range with the SPI assignment at the same index and
+/// returns one set of resolved resources per pair.
+#[cfg(guest_arch = "aarch64")]
+pub(super) fn resolve_smmu_resources(
+    smmu_ranges: &[memory_range::MemoryRange],
+    spi_layout: &crate::worker::spi_layout::ResolvedSpiLayout,
+) -> Vec<ResolvedSmmuResources> {
+    let mut resolved = Vec::new();
+    for (mmio, irqs) in smmu_ranges.iter().zip(&spi_layout.smmu) {
+        resolved.push(ResolvedSmmuResources {
+            base: mmio.start(),
+            evtq_gsiv: irqs.evtq_gsiv,
+            gerr_gsiv: irqs.gerr_gsiv,
+        });
+    }
+    resolved
+}
+
+/// Lookup maps for SMMU-covered PCIe ports, used during device wiring and
+/// VFIO validation. Both are keyed by downstream port name.
+pub(super) struct SmmuPortMaps {
+    /// Port name to the SMMU shared state of its root complex (for per-device wrapping).
+    pub port_map: HashMap<Arc<str>, Arc<smmu::SmmuSharedState>>,
+    /// Set of port names behind S1-capable SMMUs (for VFIO blocking).
+    pub s1_ports: HashSet<Arc<str>>,
+}
+
+/// Builds the per-port SMMU lookup tables used while wiring PCIe devices.
+///
+/// `smmu_shared_states` is zipped with `pcie_root_complexes`, so entry `i` belongs to root
+/// complex `i`; `None` marks a root complex that has no SMMU.
+pub(super) fn build_smmu_port_maps(
+    smmu_shared_states: &[Option<Arc<smmu::SmmuSharedState>>],
+    pcie_root_complexes: &[Arc<CloseableMutex<GenericPcieRootComplex>>],
+    smmu_instances: &[openvmm_defs::config::SmmuInstanceConfig],
+    pcie_rc_name_to_idx: &HashMap<String, usize>,
+    pcie_host_bridges: &[PcieHostBridge],
+) -> SmmuPortMaps {
+    // Every downstream port of a root complex that has SMMU shared state maps
+    // to that state; ports of root complexes without one are left out.
+    let mut port_map: HashMap<Arc<str>, Arc<smmu::SmmuSharedState>> = HashMap::new();
+    for (shared, rc) in smmu_shared_states.iter().zip(pcie_root_complexes.iter()) {
+        // The RC lock is released as soon as the port list has been fetched.
+        let ports = rc.lock().downstream_ports();
+        if let Some(state) = shared {
+            for dpi in ports {
+                port_map.insert(dpi.name, Arc::clone(state));
+            }
+        }
+    }
+
+    // Flag the root complexes named by an SMMU instance; names missing from
+    // the index map are skipped.
+    let mut covered_rcs = vec![false; pcie_host_bridges.len()];
+    for inst in smmu_instances {
+        if let Some(&rc_idx) = pcie_rc_name_to_idx.get(&inst.rc_name) {
+            covered_rcs[rc_idx] = true;
+        }
+    }
+
+    // Gather the names of all ports below a flagged root complex.
+    let mut s1_ports: HashSet<Arc<str>> = HashSet::new();
+    for (&covered, rc) in covered_rcs.iter().zip(pcie_root_complexes.iter()) {
+        let ports = rc.lock().downstream_ports();
+        if covered {
+            for dpi in ports {
+                s1_ports.insert(dpi.name);
+            }
+        }
+    }
+
+    SmmuPortMaps { port_map, s1_ports }
+}