From b26efcfdc061a54c577df9f3fb3c4c8838b962b5 Mon Sep 17 00:00:00 2001
From: John Starks
Date: Tue, 12 May 2026 16:05:19 -0700
Subject: [PATCH 01/10] vfio: add VFIO cdev + iommufd device assignment path

Add support for the modern VFIO cdev + iommufd device assignment
interface alongside the existing legacy group/container + Type1v2 path.
The cdev interface (/dev/vfio/devices/vfioN) provides per-device file
descriptors instead of the legacy group model, and iommufd (/dev/iommu)
replaces the VFIO Type1v2 container for DMA mapping via IOAS objects.
Both paths coexist--the user selects the backend per device via CLI.

The CLI adds two new flags:

- `--iommu id=<id>` creates an iommufd context
- `--vfio host=<pci_bdf>,port=<port_name>[,iommu=<id>]` assigns a
  device, optionally referencing an iommufd context for the cdev path

The old `--vfio <port_name>:<pci_bdf>` positional syntax is removed in
favor of the key-value format.

When `iommu=` is specified, the launcher opens the cdev device node and
an iommufd fd, producing a `VfioCdevDeviceHandle` resource. A new
`VfioCdevDeviceResolver` handles resolution: it allocates an IOAS, binds
the cdev device to iommufd, attaches the device to the IOAS, and
registers an `IommufdDmaTarget` with the region manager for identity DMA
mapping.

The resulting `VfioAssignedPciDevice` is identical regardless of which
path opened the device. Config space, BAR mapping, and MSI-X emulation
are shared.
--- openvmm/openvmm_core/src/worker/dispatch.rs | 15 +- openvmm/openvmm_entry/src/cli_args.rs | 98 ++++++-- openvmm/openvmm_entry/src/lib.rs | 123 +++++++--- .../pci/vfio_assigned_device/src/lib.rs | 45 +++- .../pci/vfio_assigned_device/src/manager.rs | 179 ++++++++++++++ .../pci/vfio_assigned_device/src/resolver.rs | 63 +++++ .../vfio_assigned_device_resources/src/lib.rs | 22 +- vm/devices/user_driver/vfio_sys/src/cdev.rs | 201 ++++++++++++++++ .../user_driver/vfio_sys/src/iommufd.rs | 223 ++++++++++++++++++ vm/devices/user_driver/vfio_sys/src/lib.rs | 3 + 10 files changed, 924 insertions(+), 48 deletions(-) create mode 100644 vm/devices/user_driver/vfio_sys/src/cdev.rs create mode 100644 vm/devices/user_driver/vfio_sys/src/iommufd.rs diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 4cbe51c1b8..6226cf2206 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -1954,9 +1954,10 @@ impl InitializedVm { // internally to share containers across assigned devices. #[cfg(target_os = "linux")] let vfio_inspect = { + let dma_mapper_client = memory_manager.dma_mapper_client(); let vfio_resolver = vfio_assigned_device::resolver::VfioDeviceResolver::new( driver_source.builder().build("vfio-container-mgr"), - memory_manager.dma_mapper_client(), + dma_mapper_client.clone(), ); let handle = vfio_resolver.inspect_handle(); resolver.add_async_resolver::< @@ -1965,6 +1966,18 @@ impl InitializedVm { vfio_assigned_device_resources::VfioDeviceHandle, _, >(vfio_resolver); + + // Register the VFIO cdev + iommufd resolver for devices opened + // via the cdev interface. 
+ let cdev_resolver = + vfio_assigned_device::resolver::VfioCdevDeviceResolver::new(dma_mapper_client); + resolver.add_async_resolver::< + vm_resource::kind::PciDeviceHandleKind, + _, + vfio_assigned_device_resources::VfioCdevDeviceHandle, + _, + >(cdev_resolver); + Some(handle) }; diff --git a/openvmm/openvmm_entry/src/cli_args.rs b/openvmm/openvmm_entry/src/cli_args.rs index b9889e8ad5..e76d1e5cf8 100644 --- a/openvmm/openvmm_entry/src/cli_args.rs +++ b/openvmm/openvmm_entry/src/cli_args.rs @@ -904,18 +904,34 @@ Assign a host PCI device to the guest via Linux VFIO. The device must be bound to vfio-pci on the host before starting the VM. Examples: - # Assign NVMe controller to root port rp0 - --vfio rp0:0000:01:00.0 + --vfio host=0000:01:00.0,port=rp0 + --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 + +Keys: + host= (required) PCI address on the host + port= (required) Root port or downstream switch port name + iommu= (optional) Reference to an --iommu object. When present, + uses VFIO cdev + iommufd instead of the legacy group path. +"#)] + #[cfg(target_os = "linux")] + #[clap(long, conflicts_with("pcat"))] + pub vfio: Vec, -Syntax: : + /// Create an iommufd context for VFIO cdev device assignment + #[clap(long_help = r#" +Create an iommufd context. Opens /dev/iommu and allocates an IOAS. +Referenced by --vfio devices via the iommu= key. - port_name Root port or downstream switch port name - pci_bdf PCI domain:bus:device.function of the VFIO device on - the host (use lspci -D to find it) +Requires Linux kernel >= 6.6 with iommufd support. + +Examples: + --iommu id=iommu0 --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 + +Syntax: id= "#)] #[cfg(target_os = "linux")] #[clap(long, conflicts_with("pcat"))] - pub vfio: Vec, + pub iommu: Vec, } impl Options { @@ -2427,6 +2443,8 @@ impl FromStr for PcieRemoteCli { } /// CLI configuration for a VFIO-assigned PCI device. 
+/// +/// Syntax: `host=,port=[,iommu=]` #[cfg(target_os = "linux")] #[derive(Clone, Debug)] pub struct VfioDeviceCli { @@ -2434,6 +2452,9 @@ pub struct VfioDeviceCli { pub port_name: String, /// PCI BDF address of the device on the host (e.g., "0000:01:00.0"). pub pci_id: String, + /// Optional iommufd context ID. When set, uses VFIO cdev + iommufd + /// instead of the legacy group/container path. + pub iommu: Option, } #[cfg(target_os = "linux")] @@ -2441,16 +2462,30 @@ impl FromStr for VfioDeviceCli { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - let (port_name, pci_id) = s - .split_once(':') - .context("expected : (e.g., rp0:0000:01:00.0)")?; - - if port_name.is_empty() { - anyhow::bail!("port name cannot be empty"); + let mut host = None; + let mut port = None; + let mut iommu = None; + + for kv in s.split(',') { + let (key, value) = kv + .split_once('=') + .context("expected key=value pair (e.g., host=0000:01:00.0,port=rp0)")?; + match key { + "host" => host = Some(value.to_string()), + "port" => port = Some(value.to_string()), + "iommu" => iommu = Some(value.to_string()), + _ => anyhow::bail!("unknown --vfio key: '{key}'"), + } } + let pci_id = host.context("--vfio: 'host=' is required")?; + let port_name = port.context("--vfio: 'port=' is required")?; + if pci_id.is_empty() { - anyhow::bail!("PCI address cannot be empty"); + anyhow::bail!("host PCI address cannot be empty"); + } + if port_name.is_empty() { + anyhow::bail!("port name cannot be empty"); } // Reject path separators to prevent sysfs path traversal via Path::join. @@ -2459,8 +2494,39 @@ impl FromStr for VfioDeviceCli { } Ok(VfioDeviceCli { - port_name: port_name.to_string(), - pci_id: pci_id.to_string(), + port_name, + pci_id, + iommu, + }) + } +} + +/// CLI configuration for an iommufd context. +/// +/// Syntax: `id=` +#[cfg(target_os = "linux")] +#[derive(Clone, Debug)] +pub struct IommuCli { + /// Unique identifier for this iommufd context. 
+ pub id: String, +} + +#[cfg(target_os = "linux")] +impl FromStr for IommuCli { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let (key, value) = s + .split_once('=') + .context("expected id= (e.g., id=iommu0)")?; + if key != "id" { + anyhow::bail!("expected 'id=', got '{key}=...'"); + } + if value.is_empty() { + anyhow::bail!("iommu id cannot be empty"); + } + Ok(IommuCli { + id: value.to_string(), }) } } diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 6afbc14913..0a87c33a06 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -757,37 +757,106 @@ async fn vm_config_from_command_line( let pcie_switches = build_switch_list(&opt.pcie_switch); #[cfg(target_os = "linux")] - let vfio_pcie_devices: Vec = opt - .vfio - .iter() - .map(|cli_cfg| { - use vm_resource::IntoResource; - - let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id); - let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group")) - .with_context(|| format!("failed to read IOMMU group for {}", cli_cfg.pci_id))?; - let group_id: u64 = iommu_group_link - .file_name() - .and_then(|s| s.to_str()) - .context("invalid iommu_group symlink")? - .parse() - .context("failed to parse IOMMU group ID")?; - let group = std::fs::OpenOptions::new() + let vfio_pcie_devices: Vec = { + use std::collections::HashMap; + use vm_resource::IntoResource; + + // Process --iommu flags: open /dev/iommu for each declared context. 
+ let mut iommu_map: HashMap = HashMap::new(); + for iommu_cli in &opt.iommu { + anyhow::ensure!( + !iommu_map.contains_key(&iommu_cli.id), + "duplicate --iommu id={}", + iommu_cli.id + ); + let file = std::fs::OpenOptions::new() .read(true) .write(true) - .open(format!("/dev/vfio/{group_id}")) - .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?; - - Ok(PcieDeviceConfig { - port_name: cli_cfg.port_name.clone(), - resource: vfio_assigned_device_resources::VfioDeviceHandle { - pci_id: cli_cfg.pci_id.clone(), - group, + .open("/dev/iommu") + .context("failed to open /dev/iommu (is iommufd available?)")?; + iommu_map.insert(iommu_cli.id.clone(), file); + } + + opt.vfio + .iter() + .map(|cli_cfg| { + let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id); + + if let Some(iommu_id) = &cli_cfg.iommu { + // cdev + iommufd path + let iommufd = iommu_map.get(iommu_id).with_context(|| { + format!( + "--vfio device {} references iommu={iommu_id}, \ + but no --iommu id={iommu_id} was specified", + cli_cfg.pci_id + ) + })?; + // Open a cloned iommufd fd (each device binding needs its own fd + // for VFIO_DEVICE_BIND_IOMMUFD, but they can share the same + // underlying iommufd kernel object via dup). + let iommufd = iommufd.try_clone().with_context(|| { + format!("failed to dup iommufd fd for iommu={iommu_id}") + })?; + + // Open the cdev device node. + let vfio_dev_dir = sysfs_path.join("vfio-dev"); + let entry = std::fs::read_dir(&vfio_dev_dir) + .with_context(|| { + format!( + "failed to read {}: is {} bound to vfio-pci?", + vfio_dev_dir.display(), + cli_cfg.pci_id + ) + })? + .next() + .context("no vfio-dev entry found")? 
+ .context("failed to read vfio-dev entry")?; + let dev_path = Path::new("/dev/vfio/devices").join(entry.file_name()); + let cdev = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(&dev_path) + .with_context(|| format!("failed to open {}", dev_path.display()))?; + + Ok(PcieDeviceConfig { + port_name: cli_cfg.port_name.clone(), + resource: vfio_assigned_device_resources::VfioCdevDeviceHandle { + pci_id: cli_cfg.pci_id.clone(), + cdev, + iommufd, + } + .into_resource(), + }) + } else { + // Legacy group/container path + let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group")) + .with_context(|| { + format!("failed to read IOMMU group for {}", cli_cfg.pci_id) + })?; + let group_id: u64 = iommu_group_link + .file_name() + .and_then(|s| s.to_str()) + .context("invalid iommu_group symlink")? + .parse() + .context("failed to parse IOMMU group ID")?; + let group = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(format!("/dev/vfio/{group_id}")) + .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?; + + Ok(PcieDeviceConfig { + port_name: cli_cfg.port_name.clone(), + resource: vfio_assigned_device_resources::VfioDeviceHandle { + pci_id: cli_cfg.pci_id.clone(), + group, + } + .into_resource(), + }) } - .into_resource(), }) - }) - .collect::>>()?; + .collect::>>()? + }; #[cfg(windows)] let vpci_resources: Vec<_> = opt diff --git a/vm/devices/pci/vfio_assigned_device/src/lib.rs b/vm/devices/pci/vfio_assigned_device/src/lib.rs index 6d611c4188..2b86c0dba3 100644 --- a/vm/devices/pci/vfio_assigned_device/src/lib.rs +++ b/vm/devices/pci/vfio_assigned_device/src/lib.rs @@ -194,9 +194,9 @@ pub(crate) struct VfioAssignedPciDevice { )] config_patches: BTreeMap, - /// VFIO container/group binding. Keeps the container and group fds alive - /// and notifies the container manager on drop. - binding: manager::VfioDeviceBinding, + /// VFIO binding. 
Keeps the container/group (legacy) or iommufd/IOAS + /// (cdev) fds alive and cleans up on drop. + binding: manager::VfioBinding, } impl VfioAssignedPciDevice { @@ -231,6 +231,45 @@ impl VfioAssignedPciDevice { .await .with_context(|| format!("failed to open VFIO device {pci_id}"))?; + Self::from_device( + vfio_device, + manager::VfioBinding::Group(binding), + pci_id, + register_mmio, + msi_target, + memory_mapper, + ) + .await + } + + /// Create from a pre-opened VFIO device and a cdev binding. + pub async fn from_cdev( + cdev_binding: manager::VfioCdevBinding, + pci_id: String, + register_mmio: &mut (dyn chipset_device::mmio::RegisterMmioIntercept + Send), + msi_target: &MsiTarget, + memory_mapper: &dyn MemoryMapper, + ) -> anyhow::Result { + let (device, binding) = cdev_binding.into_parts(); + Self::from_device( + device, + manager::VfioBinding::Cdev(binding), + pci_id, + register_mmio, + msi_target, + memory_mapper, + ) + .await + } + + async fn from_device( + vfio_device: vfio_sys::Device, + binding: manager::VfioBinding, + pci_id: String, + register_mmio: &mut (dyn chipset_device::mmio::RegisterMmioIntercept + Send), + msi_target: &MsiTarget, + memory_mapper: &dyn MemoryMapper, + ) -> anyhow::Result { let config_info = vfio_device .region_info(vfio_bindings::bindings::vfio::VFIO_PCI_CONFIG_REGION_INDEX) .context("failed to get VFIO config region info")?; diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 1385324fc5..27a1f624de 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -17,6 +17,7 @@ use mesh::rpc::FailableRpc; use mesh::rpc::RpcSend as _; use std::collections::HashMap; use std::fs::File; +use std::os::unix::prelude::*; use std::sync::Arc; /// Implements [`membacking::DmaTarget`] for VFIO type1 IOMMU containers. 
@@ -456,3 +457,181 @@ impl VfioContainerManager { } } } + +// --- iommufd / cdev support --- + +/// Implements [`membacking::DmaTarget`] for iommufd IOAS-based DMA mapping. +/// +/// Like `VfioType1DmaTarget`, this uses host virtual addresses for mapping, +/// but calls `IOMMU_IOAS_MAP`/`IOMMU_IOAS_UNMAP` on the iommufd fd instead +/// of `VFIO_IOMMU_MAP_DMA`/`VFIO_IOMMU_UNMAP_DMA` on a VFIO container fd. +struct IommufdDmaTarget { + ctx: Arc, + ioas_id: u32, +} + +impl membacking::DmaTarget for IommufdDmaTarget { + unsafe fn map_dma( + &self, + range: memory_range::MemoryRange, + host_va: Option<*const u8>, + _mappable: &membacking::Mappable, + _file_offset: u64, + ) -> anyhow::Result<()> { + let vaddr = + host_va.expect("iommufd IOAS map requires host VA (registered with needs_va=true)"); + let iova = range.start(); + let user_va = vaddr as u64; + let length = range.len(); + // SAFETY: The caller (DmaMapper in membacking) guarantees that the + // host VA is backed and stable via ensure_mapped + VaMapper lifetime. + unsafe { + self.ctx + .ioas_map(self.ioas_id, iova, user_va, length) + .with_context(|| { + format!( + "iommufd IOAS DMA map failed: iova={iova:#x} user_va={user_va:#x} \ + length={length:#x} ioas_id={}", + self.ioas_id + ) + }) + } + } + + fn unmap_dma(&self, range: memory_range::MemoryRange) -> anyhow::Result<()> { + let _span = tracing::info_span!("iommufd unmap", %range).entered(); + self.ctx + .ioas_unmap(self.ioas_id, range.start(), range.len()) + .context("iommufd IOAS DMA unmap failed")?; + Ok(()) + } +} + +/// Binding for a VFIO device opened via the cdev + iommufd path. +/// +/// Analogous to [`VfioDeviceBinding`] for the legacy group path, but +/// uses iommufd for DMA mapping instead of a VFIO type1 container. +/// +/// The device is bound to iommufd and attached to an IOAS. When this +/// binding is dropped, the device is automatically detached and the +/// iommufd binding is released (the kernel cleans up when the cdev fd +/// closes). 
+#[derive(Inspect)] +pub(crate) struct VfioCdevBinding { + /// PCI BDF address on the host. + pci_id: String, + /// VFIO cdev device — provides config space, BAR, IRQ ioctls. + /// The cdev fd owns the iommufd binding (released on close). + #[inspect(skip)] + device: vfio_sys::Device, + /// iommufd device ID (from `VFIO_DEVICE_BIND_IOMMUFD`). + iommufd_devid: u32, + /// IOAS ID this device is attached to. + ioas_id: u32, + /// DMA mapper registration — removes the mapper on drop. + #[inspect(skip)] + _dma_handle: membacking::DmaMapperHandle, +} + +impl VfioCdevBinding { + /// Open a VFIO cdev device, bind to iommufd, allocate an IOAS, attach + /// the device, and register the IOAS as a DMA mapper. + pub async fn new( + pci_id: String, + cdev_file: File, + iommufd_file: File, + dma_mapper_client: &DmaMapperClient, + ) -> anyhow::Result { + let ctx = Arc::new(vfio_sys::iommufd::IommufdCtx::from_file(iommufd_file)); + let cdev = vfio_sys::cdev::CdevDevice::from_file(cdev_file); + + // Allocate an IOAS for this device's DMA. + let ioas_id = ctx + .ioas_alloc() + .context("failed to allocate iommufd IOAS")?; + + // Bind the cdev device to iommufd. + let devid = cdev + .bind_iommufd(ctx.as_raw_fd()) + .context("failed to bind VFIO cdev to iommufd")?; + + // Attach the device to the IOAS. + cdev.attach_ioas(ioas_id) + .context("failed to attach cdev device to IOAS")?; + + tracing::info!( + pci_id, + iommufd_devid = devid, + ioas_id, + "VFIO cdev device bound to iommufd" + ); + + // Register the IOAS as a DMA target — the region manager will + // replay all existing guest RAM mappings into it. 
+ let dma_target: Arc = Arc::new(IommufdDmaTarget { + ctx: ctx.clone(), + ioas_id, + }); + let dma_handle = dma_mapper_client + .add_dma_mapper(dma_target, true) + .await + .context("failed to register iommufd IOAS with region manager")?; + + let device = cdev.into_device(); + + Ok(Self { + pci_id, + device, + iommufd_devid: devid, + ioas_id, + _dma_handle: dma_handle, + }) + } + + /// Consume the binding and split into the `Device` (for constructing + /// `VfioAssignedPciDevice`) and the remaining binding state (for + /// lifetime management). + pub fn into_parts(self) -> (vfio_sys::Device, VfioCdevBindingState) { + ( + self.device, + VfioCdevBindingState { + _pci_id: self.pci_id, + _iommufd_devid: self.iommufd_devid, + _ioas_id: self.ioas_id, + _dma_handle: self._dma_handle, + }, + ) + } +} + +/// The iommufd-related state from a [`VfioCdevBinding`], kept alive for +/// the lifetime of the assigned device. +/// +/// Dropping this removes the DMA mapper from the region manager and +/// allows the kernel to clean up iommufd objects. +#[derive(Inspect)] +pub(crate) struct VfioCdevBindingState { + _pci_id: String, + _iommufd_devid: u32, + _ioas_id: u32, + #[inspect(skip)] + _dma_handle: membacking::DmaMapperHandle, +} + +/// Wrapper enum for either legacy group or cdev iommufd binding. +/// +/// Kept as a field on `VfioAssignedPciDevice` to hold the underlying +/// fd/handle resources alive for the device's lifetime. 
+pub(crate) enum VfioBinding { + Group(VfioDeviceBinding), + Cdev(VfioCdevBindingState), +} + +impl Inspect for VfioBinding { + fn inspect(&self, req: inspect::Request<'_>) { + match self { + VfioBinding::Group(b) => b.inspect(req), + VfioBinding::Cdev(b) => b.inspect(req), + } + } +} diff --git a/vm/devices/pci/vfio_assigned_device/src/resolver.rs b/vm/devices/pci/vfio_assigned_device/src/resolver.rs index 089744e51e..f994f3de80 100644 --- a/vm/devices/pci/vfio_assigned_device/src/resolver.rs +++ b/vm/devices/pci/vfio_assigned_device/src/resolver.rs @@ -11,6 +11,7 @@ use async_trait::async_trait; use membacking::DmaMapperClient; use pci_resources::ResolvePciDeviceHandleParams; use pci_resources::ResolvedPciDevice; +use vfio_assigned_device_resources::VfioCdevDeviceHandle; use vfio_assigned_device_resources::VfioDeviceHandle; use vm_resource::AsyncResolveResource; use vm_resource::ResourceResolver; @@ -87,3 +88,65 @@ impl AsyncResolveResource for VfioDeviceR Ok(device.into()) } } + +/// Resource resolver for [`VfioCdevDeviceHandle`] (cdev + iommufd path). +/// +/// Unlike the legacy group resolver, cdev devices are self-contained — +/// each device has its own `/dev/vfio/devices/vfioN` fd and its own +/// iommufd fd. There's no shared container manager; each device gets +/// its own IOAS. +pub struct VfioCdevDeviceResolver { + dma_mapper_client: DmaMapperClient, +} + +impl VfioCdevDeviceResolver { + /// Create a new cdev resolver. 
+ pub fn new(dma_mapper_client: DmaMapperClient) -> Self { + Self { dma_mapper_client } + } +} + +#[async_trait] +impl AsyncResolveResource for VfioCdevDeviceResolver { + type Output = ResolvedPciDevice; + type Error = anyhow::Error; + + async fn resolve( + &self, + _resolver: &ResourceResolver, + resource: VfioCdevDeviceHandle, + input: ResolvePciDeviceHandleParams<'_>, + ) -> Result { + let VfioCdevDeviceHandle { + pci_id, + cdev, + iommufd, + } = resource; + + tracing::info!(pci_id, "opening VFIO cdev device with iommufd"); + + let cdev_binding = crate::manager::VfioCdevBinding::new( + pci_id.clone(), + cdev, + iommufd, + &self.dma_mapper_client, + ) + .await + .context("failed to set up VFIO cdev + iommufd binding")?; + + let memory_mapper = input + .shared_mem_mapper + .context("memory mapper is required for VFIO device assignment")?; + + let device = VfioAssignedPciDevice::from_cdev( + cdev_binding, + pci_id, + input.register_mmio, + input.msi_target, + memory_mapper, + ) + .await?; + + Ok(device.into()) + } +} diff --git a/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs b/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs index e0b6875a20..3fede02576 100644 --- a/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs +++ b/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs @@ -10,7 +10,7 @@ use std::fs::File; use vm_resource::ResourceId; use vm_resource::kind::PciDeviceHandleKind; -/// A handle to a VFIO-assigned PCI device. +/// A handle to a VFIO-assigned PCI device (legacy group path). /// /// The launcher opens the VFIO group file descriptor (e.g., `/dev/vfio/N`) /// and passes it here so that the VMM process does not need direct access @@ -26,3 +26,23 @@ pub struct VfioDeviceHandle { impl ResourceId for VfioDeviceHandle { const ID: &'static str = "vfio"; } + +/// A handle to a VFIO-assigned PCI device (cdev + iommufd path). 
+/// +/// The launcher opens the VFIO cdev file descriptor +/// (e.g., `/dev/vfio/devices/vfio0`) and the iommufd file descriptor +/// (`/dev/iommu`) and passes them here. The VMM binds the device to the +/// iommufd instance and attaches an IOAS for DMA mapping. +#[derive(MeshPayload)] +pub struct VfioCdevDeviceHandle { + /// PCI BDF address on the host (e.g., "0000:3f:7a.0"). + pub pci_id: String, + /// Pre-opened VFIO cdev file descriptor (`/dev/vfio/devices/vfioN`). + pub cdev: File, + /// Pre-opened iommufd file descriptor (`/dev/iommu`). + pub iommufd: File, +} + +impl ResourceId for VfioCdevDeviceHandle { + const ID: &'static str = "vfio-cdev"; +} diff --git a/vm/devices/user_driver/vfio_sys/src/cdev.rs b/vm/devices/user_driver/vfio_sys/src/cdev.rs new file mode 100644 index 0000000000..d6792bc415 --- /dev/null +++ b/vm/devices/user_driver/vfio_sys/src/cdev.rs @@ -0,0 +1,201 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VFIO cdev (per-device fd) support. +//! +//! VFIO cdev is the modern device-access interface (`/dev/vfio/devices/vfioN`) +//! that replaces the legacy group/container model. Each device gets its own +//! character device node. The device is bound to an iommufd instance via +//! `VFIO_DEVICE_BIND_IOMMUFD`, and DMA is configured by attaching an iommufd +//! IOAS or HWPT via `VFIO_DEVICE_ATTACH_IOMMUFD_PT`. +//! +//! Once bound and attached, the device fd supports the same `VFIO_DEVICE_*` +//! ioctls as the legacy group path (get_info, get_region_info, set_irqs, +//! reset, mmap). The [`CdevDevice`] type wraps the fd and provides these +//! operations, producing a [`super::Device`] for the common ioctl surface. 
+ +use anyhow::Context as _; +use std::fs; +use std::os::unix::prelude::*; + +mod ioctl { + use nix::request_code_none; + use vfio_bindings::bindings::vfio::VFIO_BASE; + use vfio_bindings::bindings::vfio::VFIO_TYPE; + + // VFIO_DEVICE_BIND_IOMMUFD = _IO(VFIO_TYPE, VFIO_BASE + 18) + nix::ioctl_readwrite_bad!( + vfio_device_bind_iommufd, + request_code_none!(VFIO_TYPE, VFIO_BASE + 18), + super::VfioDeviceBindIommufd + ); + + // VFIO_DEVICE_ATTACH_IOMMUFD_PT = _IO(VFIO_TYPE, VFIO_BASE + 19) + nix::ioctl_readwrite_bad!( + vfio_device_attach_iommufd_pt, + request_code_none!(VFIO_TYPE, VFIO_BASE + 19), + super::VfioDeviceAttachIommufdPt + ); + + // VFIO_DEVICE_DETACH_IOMMUFD_PT = _IO(VFIO_TYPE, VFIO_BASE + 20) + nix::ioctl_readwrite_bad!( + vfio_device_detach_iommufd_pt, + request_code_none!(VFIO_TYPE, VFIO_BASE + 20), + super::VfioDeviceDetachIommufdPt + ); +} + +// Kernel ABI structs — must match `include/uapi/linux/vfio.h` exactly. + +#[repr(C)] +pub struct VfioDeviceBindIommufd { + pub argsz: u32, + pub flags: u32, + pub iommufd: i32, + pub out_devid: u32, +} + +#[repr(C)] +pub struct VfioDeviceAttachIommufdPt { + pub argsz: u32, + pub flags: u32, + pub pt_id: u32, +} + +#[repr(C)] +pub struct VfioDeviceDetachIommufdPt { + pub argsz: u32, + pub flags: u32, +} + +/// A VFIO device opened via the cdev interface (`/dev/vfio/devices/vfioN`). +/// +/// This is the modern per-device access path. After opening, the device must +/// be bound to an iommufd fd via [`bind_iommufd`](Self::bind_iommufd) and +/// then attached to an IOAS or HWPT via [`attach_ioas`](Self::attach_ioas) +/// before any DMA can occur. +/// +/// Once bound and attached, call [`into_device`](Self::into_device) to get +/// the standard [`Device`](super::Device) for config space, BAR, IRQ, and +/// mmap operations. +pub struct CdevDevice { + file: fs::File, +} + +impl CdevDevice { + /// Open a VFIO cdev device by its sysfs PCI address. 
+ /// + /// Looks up the device's cdev node via + /// `/sys/bus/pci/devices//vfio-dev/` and opens the corresponding + /// `/dev/vfio/devices/vfioN` character device. + pub fn open(pci_id: &str) -> anyhow::Result { + let vfio_dev_dir = std::path::Path::new("/sys/bus/pci/devices") + .join(pci_id) + .join("vfio-dev"); + + // The vfio-dev/ directory contains a single entry like "vfio0" + let entry = fs::read_dir(&vfio_dev_dir) + .with_context(|| { + format!( + "failed to read {}: is the device bound to vfio-pci?", + vfio_dev_dir.display() + ) + })? + .next() + .context("no vfio-dev entry found")? + .context("failed to read vfio-dev entry")?; + + let dev_name = entry.file_name(); + let dev_path = std::path::Path::new("/dev/vfio/devices").join(&dev_name); + + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .open(&dev_path) + .with_context(|| format!("failed to open {}", dev_path.display()))?; + + Ok(Self { file }) + } + + /// Wrap a pre-opened VFIO cdev file descriptor. + pub fn from_file(file: fs::File) -> Self { + Self { file } + } + + /// Bind this device to an iommufd instance. + /// + /// Returns the kernel-assigned device ID within the iommufd context. + /// This must be called before any DMA operations. + pub fn bind_iommufd(&self, iommufd_fd: RawFd) -> anyhow::Result { + let mut cmd = VfioDeviceBindIommufd { + argsz: size_of::() as u32, + flags: 0, + iommufd: iommufd_fd, + out_devid: 0, + }; + // SAFETY: Both fds are valid, struct correctly constructed. + unsafe { + ioctl::vfio_device_bind_iommufd(self.file.as_raw_fd(), &mut cmd) + .context("VFIO_DEVICE_BIND_IOMMUFD failed")?; + } + Ok(cmd.out_devid) + } + + /// Attach the device to an IOAS or HWPT by its iommufd object ID. + /// + /// For Phase 4 (identity DMA), pass an IOAS ID. For Phase 5+ (nested + /// translation), pass a HWPT ID. + /// + /// Returns the attached page table ID (may differ from input if the + /// kernel auto-created a HWPT for the IOAS). 
+ pub fn attach_ioas(&self, pt_id: u32) -> anyhow::Result { + let mut cmd = VfioDeviceAttachIommufdPt { + argsz: size_of::() as u32, + flags: 0, + pt_id, + }; + // SAFETY: fd is valid, struct correctly constructed. + unsafe { + ioctl::vfio_device_attach_iommufd_pt(self.file.as_raw_fd(), &mut cmd) + .context("VFIO_DEVICE_ATTACH_IOMMUFD_PT failed")?; + } + Ok(cmd.pt_id) + } + + /// Detach the device from its current IOAS/HWPT. + /// + /// After detaching, the device is in a blocking DMA state. + pub fn detach_ioas(&self) -> anyhow::Result<()> { + let mut cmd = VfioDeviceDetachIommufdPt { + argsz: size_of::() as u32, + flags: 0, + }; + // SAFETY: fd is valid, struct correctly constructed. + unsafe { + ioctl::vfio_device_detach_iommufd_pt(self.file.as_raw_fd(), &mut cmd) + .context("VFIO_DEVICE_DETACH_IOMMUFD_PT failed")?; + } + Ok(()) + } + + /// Convert to a standard [`Device`](super::Device) for config space, + /// BAR, IRQ, and mmap operations. + /// + /// The cdev fd supports the same `VFIO_DEVICE_*` ioctls as the legacy + /// group path, so the [`Device`](super::Device) type works unchanged. + pub fn into_device(self) -> super::Device { + super::Device { file: self.file } + } +} + +impl AsRef for CdevDevice { + fn as_ref(&self) -> &fs::File { + &self.file + } +} + +impl AsFd for CdevDevice { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} diff --git a/vm/devices/user_driver/vfio_sys/src/iommufd.rs b/vm/devices/user_driver/vfio_sys/src/iommufd.rs new file mode 100644 index 0000000000..8cf6bd6300 --- /dev/null +++ b/vm/devices/user_driver/vfio_sys/src/iommufd.rs @@ -0,0 +1,223 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Bindings for the Linux iommufd subsystem (`/dev/iommu`). +//! +//! Provides safe wrappers around `IOMMU_IOAS_ALLOC`, `IOMMU_IOAS_MAP`, +//! `IOMMU_IOAS_UNMAP`, and `IOMMU_DESTROY` ioctls. These are the Phase 4 +//! (identity DMA mapping) operations. Phase 5+ will add nested HWPT, +//! 
vIOMMU, vDevice, and vEVENTQ operations. + +use anyhow::Context as _; +use std::fs; +use std::os::unix::prelude::*; + +/// iommufd ioctl type character (';' = 0x3B). +const IOMMUFD_TYPE: u8 = b';'; + +/// Base command number for iommufd ioctls. +const IOMMUFD_CMD_BASE: u8 = 0x80; + +// Command numbers (IOMMUFD_CMD_BASE + offset). +const IOMMUFD_CMD_DESTROY: u8 = IOMMUFD_CMD_BASE; +const IOMMUFD_CMD_IOAS_ALLOC: u8 = IOMMUFD_CMD_BASE + 1; +const IOMMUFD_CMD_IOAS_MAP: u8 = IOMMUFD_CMD_BASE + 5; +const IOMMUFD_CMD_IOAS_UNMAP: u8 = IOMMUFD_CMD_BASE + 6; + +/// Flags for `IOMMU_IOAS_MAP`. +pub const IOMMU_IOAS_MAP_FIXED_IOVA: u32 = 1 << 0; +pub const IOMMU_IOAS_MAP_WRITEABLE: u32 = 1 << 1; +pub const IOMMU_IOAS_MAP_READABLE: u32 = 1 << 2; + +mod ioctl { + use nix::request_code_none; + + // IOMMUFD ioctls use _IO (no direction, just type + nr). + // The kernel defines them as _IO(IOMMUFD_TYPE, cmd_nr). + nix::ioctl_readwrite_bad!( + iommu_destroy, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_DESTROY as u32 + ), + super::IommuDestroy + ); + nix::ioctl_readwrite_bad!( + iommu_ioas_alloc, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_IOAS_ALLOC as u32 + ), + super::IommuIoasAlloc + ); + nix::ioctl_readwrite_bad!( + iommu_ioas_map, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_IOAS_MAP as u32 + ), + super::IommuIoasMap + ); + nix::ioctl_readwrite_bad!( + iommu_ioas_unmap, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_IOAS_UNMAP as u32 + ), + super::IommuIoasUnmap + ); +} + +// Kernel ABI structs — must match `include/uapi/linux/iommufd.h` exactly. 
+ +#[repr(C)] +pub struct IommuDestroy { + pub size: u32, + pub id: u32, +} + +#[repr(C)] +pub struct IommuIoasAlloc { + pub size: u32, + pub flags: u32, + pub out_ioas_id: u32, +} + +#[repr(C)] +pub struct IommuIoasMap { + pub size: u32, + pub flags: u32, + pub ioas_id: u32, + pub __reserved: u32, + pub user_va: u64, + pub length: u64, + pub iova: u64, +} + +#[repr(C)] +pub struct IommuIoasUnmap { + pub size: u32, + pub ioas_id: u32, + pub iova: u64, + pub length: u64, +} + +/// An open iommufd file descriptor (`/dev/iommu`). +/// +/// Wraps the fd and provides safe methods for the iommufd ioctls needed +/// for Phase 4 (identity DMA mapping via IOAS). +pub struct IommufdCtx { + file: fs::File, +} + +impl IommufdCtx { + /// Open `/dev/iommu` and return a new iommufd context. + pub fn new() -> anyhow::Result { + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .open("/dev/iommu") + .context("failed to open /dev/iommu")?; + Ok(Self { file }) + } + + /// Wrap an existing iommufd file descriptor. + pub fn from_file(file: fs::File) -> Self { + Self { file } + } + + /// Allocate a new IO Address Space (IOAS). + /// + /// Returns the kernel-assigned IOAS object ID. + pub fn ioas_alloc(&self) -> anyhow::Result { + let mut cmd = IommuIoasAlloc { + size: size_of::() as u32, + flags: 0, + out_ioas_id: 0, + }; + // SAFETY: fd is valid, struct is correctly sized and zeroed. + unsafe { + ioctl::iommu_ioas_alloc(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_IOAS_ALLOC failed")?; + } + Ok(cmd.out_ioas_id) + } + + /// Map a user VA range into an IOAS at a fixed IOVA. + /// + /// `ioas_id` is the IOAS to map into. `iova` is the fixed IO virtual + /// address. `user_va` is the host virtual address of the backing memory. + /// `length` is the size in bytes (must be page-aligned). + /// + /// # Safety + /// `user_va` must point to valid, backed memory for `length` bytes. + /// The memory must remain mapped for the lifetime of this IOAS mapping. 
+ pub unsafe fn ioas_map( + &self, + ioas_id: u32, + iova: u64, + user_va: u64, + length: u64, + ) -> anyhow::Result<()> { + let mut cmd = IommuIoasMap { + size: size_of::() as u32, + flags: IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE, + ioas_id, + __reserved: 0, + user_va, + length, + iova, + }; + // SAFETY: fd is valid, struct correctly constructed. Caller + // guarantees user_va is backed and stable. + unsafe { + ioctl::iommu_ioas_map(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_IOAS_MAP failed")?; + } + Ok(()) + } + + /// Unmap an IOVA range from an IOAS. + /// + /// Returns the number of bytes actually unmapped. + pub fn ioas_unmap(&self, ioas_id: u32, iova: u64, length: u64) -> anyhow::Result { + let mut cmd = IommuIoasUnmap { + size: size_of::() as u32, + ioas_id, + iova, + length, + }; + // SAFETY: fd is valid, struct correctly constructed. + unsafe { + ioctl::iommu_ioas_unmap(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_IOAS_UNMAP failed")?; + } + Ok(cmd.length) + } + + /// Destroy an iommufd object by its ID. + pub fn destroy(&self, id: u32) -> anyhow::Result<()> { + let mut cmd = IommuDestroy { + size: size_of::() as u32, + id, + }; + // SAFETY: fd is valid, struct correctly constructed. + unsafe { + ioctl::iommu_destroy(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_DESTROY failed")?; + } + Ok(()) + } +} + +impl AsFd for IommufdCtx { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} + +impl AsRawFd for IommufdCtx { + fn as_raw_fd(&self) -> RawFd { + self.file.as_raw_fd() + } +} diff --git a/vm/devices/user_driver/vfio_sys/src/lib.rs b/vm/devices/user_driver/vfio_sys/src/lib.rs index 0af03092b3..ae887228ec 100644 --- a/vm/devices/user_driver/vfio_sys/src/lib.rs +++ b/vm/devices/user_driver/vfio_sys/src/lib.rs @@ -6,6 +6,9 @@ // UNSAFETY: Manual memory management with mmap and vfio ioctls. 
#![expect(unsafe_code)] +pub mod cdev; +pub mod iommufd; + use anyhow::Context; use bitfield_struct::bitfield; use headervec::HeaderVec; From 5797b79dcd8eb36cc9eabea77cb09b6aa115e44f Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 12 May 2026 16:47:34 -0700 Subject: [PATCH 02/10] share --- openvmm/openvmm_core/src/worker/dispatch.rs | 9 +- openvmm/openvmm_entry/src/lib.rs | 1 + .../pci/vfio_assigned_device/src/manager.rs | 379 +++++++++++++++--- .../pci/vfio_assigned_device/src/resolver.rs | 52 ++- .../vfio_assigned_device_resources/src/lib.rs | 3 + vm/vmcore/vm_topology/src/memory.rs | 36 ++ 6 files changed, 395 insertions(+), 85 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 6226cf2206..c4720a8fea 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -1968,9 +1968,12 @@ impl InitializedVm { >(vfio_resolver); // Register the VFIO cdev + iommufd resolver for devices opened - // via the cdev interface. - let cdev_resolver = - vfio_assigned_device::resolver::VfioCdevDeviceResolver::new(dma_mapper_client); + // via the cdev interface. Spawns a VfioCdevManager task that + // shares IOAS contexts across devices with the same --iommu ID. 
+ let cdev_resolver = vfio_assigned_device::resolver::VfioCdevDeviceResolver::new( + driver_source.builder().build("vfio-cdev-mgr"), + dma_mapper_client, + ); resolver.add_async_resolver::< vm_resource::kind::PciDeviceHandleKind, _, diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 0a87c33a06..93bff58234 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -824,6 +824,7 @@ async fn vm_config_from_command_line( pci_id: cli_cfg.pci_id.clone(), cdev, iommufd, + iommu_id: iommu_id.clone(), } .into_resource(), }) diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 27a1f624de..1dadfed2fd 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -507,98 +507,336 @@ impl membacking::DmaTarget for IommufdDmaTarget { } } -/// Binding for a VFIO device opened via the cdev + iommufd path. -/// -/// Analogous to [`VfioDeviceBinding`] for the legacy group path, but -/// uses iommufd for DMA mapping instead of a VFIO type1 container. +/// Shared iommufd IOAS context for a single `--iommu` instance. /// -/// The device is bound to iommufd and attached to an IOAS. When this -/// binding is dropped, the device is automatically detached and the -/// iommufd binding is released (the kernel cleans up when the cdev fd -/// closes). -#[derive(Inspect)] -pub(crate) struct VfioCdevBinding { - /// PCI BDF address on the host. +/// All VFIO cdev devices referencing the same `--iommu id=` share +/// one IOAS — one set of IOMMU page tables, one DMA mapper registration. +/// This matches the legacy path's container-sharing behavior and QEMU's +/// iommufd backend model. +struct IoasEntry { + ctx: Arc, + ioas_id: u32, + /// Keeps the DMA mapper registered with the region manager. + _dma_handle: membacking::DmaMapperHandle, + /// Number of devices currently using this IOAS. 
+ device_count: u64, +} + +/// Tracks a cdev device for inspect and cleanup. +struct CdevDeviceEntry { + id: u64, pci_id: String, - /// VFIO cdev device — provides config space, BAR, IRQ ioctls. - /// The cdev fd owns the iommufd binding (released on close). + iommu_id: String, +} + +/// RPC messages for the cdev manager task. +pub(crate) enum VfioCdevManagerRpc { + /// Bind a cdev device to an IOAS (creating the IOAS if this is the + /// first device for the given iommu ID). Returns the opened + /// `vfio_sys::Device` and metadata needed for `VfioCdevBindingState`. + PrepareDevice(FailableRpc), + /// Notify that a device has been dropped. + RemoveDevice(u64), + /// Inspect. + Inspect(inspect::Deferred), +} + +/// Request payload for `PrepareDevice`. +pub(crate) struct CdevPrepareRequest { + pub pci_id: String, + pub cdev: File, + pub iommufd: File, + pub iommu_id: String, +} + +/// Response payload for `PrepareDevice`. +pub(crate) struct CdevPrepareResponse { + pub device: vfio_sys::Device, + pub iommufd_devid: u32, + pub ioas_id: u32, + pub device_id: u64, +} + +/// Manages iommufd IOAS contexts and cdev device bindings. +/// +/// Analogous to [`VfioContainerManager`] for the legacy group path. +/// Shares a single IOAS (and DMA mapper registration) across all devices +/// referencing the same `--iommu` ID. +#[derive(InspectMut)] +#[inspect(extra = "Self::inspect_topology")] +pub(crate) struct VfioCdevManager { + /// IOAS contexts keyed by `--iommu` ID. #[inspect(skip)] - device: vfio_sys::Device, - /// iommufd device ID (from `VFIO_DEVICE_BIND_IOMMUFD`). - iommufd_devid: u32, - /// IOAS ID this device is attached to. - ioas_id: u32, - /// DMA mapper registration — removes the mapper on drop. + ioas_entries: HashMap, + /// Active devices. #[inspect(skip)] - _dma_handle: membacking::DmaMapperHandle, + devices: Vec, + /// Next device ID. + #[inspect(skip)] + next_device_id: u64, + /// DMA mapper client for registering new IOAS contexts. 
+ #[inspect(skip)] + dma_mapper_client: DmaMapperClient, + #[inspect(skip)] + recv: mesh::Receiver, } -impl VfioCdevBinding { - /// Open a VFIO cdev device, bind to iommufd, allocate an IOAS, attach - /// the device, and register the IOAS as a DMA mapper. - pub async fn new( - pci_id: String, - cdev_file: File, - iommufd_file: File, - dma_mapper_client: &DmaMapperClient, - ) -> anyhow::Result { - let ctx = Arc::new(vfio_sys::iommufd::IommufdCtx::from_file(iommufd_file)); - let cdev = vfio_sys::cdev::CdevDevice::from_file(cdev_file); +/// Client handle for the [`VfioCdevManager`] task. +#[derive(Clone)] +pub struct VfioCdevManagerClient { + sender: mesh::Sender, +} + +impl Inspect for VfioCdevManagerClient { + fn inspect(&self, req: inspect::Request<'_>) { + self.sender.send(VfioCdevManagerRpc::Inspect(req.defer())); + } +} + +impl VfioCdevManagerClient { + pub(crate) async fn prepare_device( + &self, + req: CdevPrepareRequest, + ) -> anyhow::Result { + Ok(self + .sender + .call_failable(VfioCdevManagerRpc::PrepareDevice, req) + .await?) + } + + /// Returns a clone of the sender for passing to `VfioCdevBinding`. + pub(crate) fn sender(&self) -> mesh::Sender { + self.sender.clone() + } +} + +impl VfioCdevManager { + /// Create a new cdev manager. + pub fn new(dma_mapper_client: DmaMapperClient) -> Self { + Self { + ioas_entries: HashMap::new(), + devices: Vec::new(), + next_device_id: 0, + dma_mapper_client, + recv: mesh::Receiver::new(), + } + } + + /// Run the cdev manager task, processing RPCs until the channel closes. 
+ pub async fn run(mut self) { + while let Ok(rpc) = self.recv.recv().await { + match rpc { + VfioCdevManagerRpc::PrepareDevice(rpc) => { + rpc.handle_failable(async |req| self.prepare_device(req).await) + .await + } + VfioCdevManagerRpc::RemoveDevice(device_id) => { + self.remove_device(device_id); + } + VfioCdevManagerRpc::Inspect(deferred) => deferred.inspect(&mut self), + } + } + } + + async fn prepare_device( + &mut self, + req: CdevPrepareRequest, + ) -> anyhow::Result { + let CdevPrepareRequest { + pci_id, + cdev: cdev_file, + iommufd: iommufd_file, + iommu_id, + } = req; + + tracing::info!(pci_id, iommu_id, "cdev manager: preparing device"); + + // Get or create the IOAS for this iommu ID. + if !self.ioas_entries.contains_key(&iommu_id) { + let ctx = Arc::new(vfio_sys::iommufd::IommufdCtx::from_file(iommufd_file)); + let ioas_id = ctx + .ioas_alloc() + .context("failed to allocate iommufd IOAS")?; + + let dma_target: Arc = Arc::new(IommufdDmaTarget { + ctx: ctx.clone(), + ioas_id, + }); + let dma_handle = self + .dma_mapper_client + .add_dma_mapper(dma_target, true) + .await + .context("failed to register iommufd IOAS with region manager")?; + + tracing::info!( + iommu_id, + ioas_id, + "created shared iommufd IOAS for iommu context" + ); + + self.ioas_entries.insert( + iommu_id.clone(), + IoasEntry { + ctx, + ioas_id, + _dma_handle: dma_handle, + device_count: 0, + }, + ); + } - // Allocate an IOAS for this device's DMA. - let ioas_id = ctx - .ioas_alloc() - .context("failed to allocate iommufd IOAS")?; + let entry = self.ioas_entries.get_mut(&iommu_id).unwrap(); + let cdev = vfio_sys::cdev::CdevDevice::from_file(cdev_file); - // Bind the cdev device to iommufd. + // Bind the cdev device to this iommu context's iommufd. let devid = cdev - .bind_iommufd(ctx.as_raw_fd()) + .bind_iommufd(entry.ctx.as_raw_fd()) .context("failed to bind VFIO cdev to iommufd")?; - // Attach the device to the IOAS. 
- cdev.attach_ioas(ioas_id) + // Attach the device to the shared IOAS. + cdev.attach_ioas(entry.ioas_id) .context("failed to attach cdev device to IOAS")?; + let device_id = self.next_device_id; + self.next_device_id += 1; + entry.device_count += 1; + + self.devices.push(CdevDeviceEntry { + id: device_id, + pci_id: pci_id.clone(), + iommu_id: iommu_id.clone(), + }); + tracing::info!( pci_id, + iommu_id, iommufd_devid = devid, - ioas_id, - "VFIO cdev device bound to iommufd" + ioas_id = entry.ioas_id, + device_id, + "VFIO cdev device attached to shared IOAS" ); - // Register the IOAS as a DMA target — the region manager will - // replay all existing guest RAM mappings into it. - let dma_target: Arc = Arc::new(IommufdDmaTarget { - ctx: ctx.clone(), - ioas_id, + Ok(CdevPrepareResponse { + device: cdev.into_device(), + iommufd_devid: devid, + ioas_id: entry.ioas_id, + device_id, + }) + } + + fn remove_device(&mut self, device_id: u64) { + if let Some(pos) = self.devices.iter().position(|d| d.id == device_id) { + let entry = self.devices.swap_remove(pos); + tracing::info!( + device_id, + pci_id = entry.pci_id, + iommu_id = entry.iommu_id, + "removing cdev device" + ); + + if let Some(ioas) = self.ioas_entries.get_mut(&entry.iommu_id) { + ioas.device_count -= 1; + if ioas.device_count == 0 { + tracing::info!( + iommu_id = entry.iommu_id, + "closing iommufd IOAS (no remaining devices)" + ); + self.ioas_entries.remove(&entry.iommu_id); + } + } + } + } + + fn inspect_topology(&self, resp: &mut inspect::Response<'_>) { + resp.child("ioas", |req| { + let mut resp = req.respond(); + for (iommu_id, entry) in &self.ioas_entries { + resp.child(iommu_id, |req| { + let mut resp = req.respond(); + resp.field("ioas_id", entry.ioas_id); + resp.field("device_count", entry.device_count); + resp.child("device", |req| { + let mut resp = req.respond(); + for dev in &self.devices { + if dev.iommu_id == *iommu_id { + resp.field(&dev.pci_id, ()); + } + } + }); + }); + } }); - let dma_handle = 
dma_mapper_client - .add_dma_mapper(dma_target, true) - .await - .context("failed to register iommufd IOAS with region manager")?; + } - let device = cdev.into_device(); + pub(crate) fn client(&mut self) -> VfioCdevManagerClient { + VfioCdevManagerClient { + sender: self.recv.sender(), + } + } +} + +/// Binding for a VFIO device opened via the cdev + iommufd path. +/// +/// Analogous to [`VfioDeviceBinding`] for the legacy group path. +/// Notifies the cdev manager on drop so device counts stay accurate +/// and IOAS contexts are cleaned up when the last device is removed. +#[derive(Inspect)] +pub(crate) struct VfioCdevBinding { + /// PCI BDF address on the host. + pci_id: String, + /// VFIO cdev device — provides config space, BAR, IRQ ioctls. + #[inspect(skip)] + device: vfio_sys::Device, + /// iommufd device ID (from `VFIO_DEVICE_BIND_IOMMUFD`). + iommufd_devid: u32, + /// IOAS ID this device is attached to. + ioas_id: u32, + /// Device ID assigned by the manager (for drop notification). + #[inspect(skip)] + device_id: u64, + /// Sender to notify the manager on drop. + #[inspect(skip)] + sender: mesh::Sender, +} - Ok(Self { +impl VfioCdevBinding { + /// Create from a manager response. + pub(crate) fn from_response( + resp: CdevPrepareResponse, + pci_id: String, + sender: mesh::Sender, + ) -> Self { + Self { pci_id, - device, - iommufd_devid: devid, - ioas_id, - _dma_handle: dma_handle, - }) + device: resp.device, + iommufd_devid: resp.iommufd_devid, + ioas_id: resp.ioas_id, + device_id: resp.device_id, + sender, + } } /// Consume the binding and split into the `Device` (for constructing /// `VfioAssignedPciDevice`) and the remaining binding state (for - /// lifetime management). + /// lifetime management). The state's `Drop` impl notifies the manager + /// when the device is released. 
pub fn into_parts(self) -> (vfio_sys::Device, VfioCdevBindingState) { + let Self { + pci_id, + device, + iommufd_devid, + ioas_id, + device_id, + sender, + } = self; ( - self.device, + device, VfioCdevBindingState { - _pci_id: self.pci_id, - _iommufd_devid: self.iommufd_devid, - _ioas_id: self.ioas_id, - _dma_handle: self._dma_handle, + _pci_id: pci_id, + _iommufd_devid: iommufd_devid, + _ioas_id: ioas_id, + _device_id: device_id, + _sender: sender, }, ) } @@ -607,15 +845,24 @@ impl VfioCdevBinding { /// The iommufd-related state from a [`VfioCdevBinding`], kept alive for /// the lifetime of the assigned device. /// -/// Dropping this removes the DMA mapper from the region manager and -/// allows the kernel to clean up iommufd objects. +/// Notifies the cdev manager on drop so device counts and IOAS contexts +/// are cleaned up. #[derive(Inspect)] pub(crate) struct VfioCdevBindingState { _pci_id: String, _iommufd_devid: u32, _ioas_id: u32, #[inspect(skip)] - _dma_handle: membacking::DmaMapperHandle, + _device_id: u64, + #[inspect(skip)] + _sender: mesh::Sender, +} + +impl Drop for VfioCdevBindingState { + fn drop(&mut self) { + self._sender + .send(VfioCdevManagerRpc::RemoveDevice(self._device_id)); + } } /// Wrapper enum for either legacy group or cdev iommufd binding. diff --git a/vm/devices/pci/vfio_assigned_device/src/resolver.rs b/vm/devices/pci/vfio_assigned_device/src/resolver.rs index f994f3de80..a21ca829d3 100644 --- a/vm/devices/pci/vfio_assigned_device/src/resolver.rs +++ b/vm/devices/pci/vfio_assigned_device/src/resolver.rs @@ -91,18 +91,29 @@ impl AsyncResolveResource for VfioDeviceR /// Resource resolver for [`VfioCdevDeviceHandle`] (cdev + iommufd path). /// -/// Unlike the legacy group resolver, cdev devices are self-contained — -/// each device has its own `/dev/vfio/devices/vfioN` fd and its own -/// iommufd fd. There's no shared container manager; each device gets -/// its own IOAS. 
+/// Spawns a [`VfioCdevManager`](crate::manager::VfioCdevManager) task +/// internally and communicates with it via RPC to share IOAS contexts +/// across devices referencing the same `--iommu` ID. pub struct VfioCdevDeviceResolver { - dma_mapper_client: DmaMapperClient, + client: crate::manager::VfioCdevManagerClient, + _task: pal_async::task::Task<()>, } impl VfioCdevDeviceResolver { - /// Create a new cdev resolver. - pub fn new(dma_mapper_client: DmaMapperClient) -> Self { - Self { dma_mapper_client } + /// Create a new cdev resolver, spawning the cdev manager task. + pub fn new(spawner: impl pal_async::task::Spawn, dma_mapper_client: DmaMapperClient) -> Self { + let mut manager = crate::manager::VfioCdevManager::new(dma_mapper_client); + let client = manager.client(); + let task = spawner.spawn("vfio-cdev-mgr", manager.run()); + Self { + client, + _task: task, + } + } + + /// Returns a handle for the VM's inspect tree. + pub fn inspect_handle(&self) -> crate::manager::VfioCdevManagerClient { + self.client.clone() } } @@ -121,18 +132,27 @@ impl AsyncResolveResource for VfioCde pci_id, cdev, iommufd, + iommu_id, } = resource; - tracing::info!(pci_id, "opening VFIO cdev device with iommufd"); + tracing::info!(pci_id, iommu_id, "opening VFIO cdev device with iommufd"); + + let resp = self + .client + .prepare_device(crate::manager::CdevPrepareRequest { + pci_id: pci_id.clone(), + cdev, + iommufd, + iommu_id, + }) + .await + .context("VFIO cdev manager failed")?; - let cdev_binding = crate::manager::VfioCdevBinding::new( + let cdev_binding = crate::manager::VfioCdevBinding::from_response( + resp, pci_id.clone(), - cdev, - iommufd, - &self.dma_mapper_client, - ) - .await - .context("failed to set up VFIO cdev + iommufd binding")?; + self.client.sender(), + ); let memory_mapper = input .shared_mem_mapper diff --git a/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs b/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs index 3fede02576..744e4818fe 100644 
--- a/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs +++ b/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs @@ -41,6 +41,9 @@ pub struct VfioCdevDeviceHandle { pub cdev: File, /// Pre-opened iommufd file descriptor (`/dev/iommu`). pub iommufd: File, + /// The `--iommu` context ID this device belongs to. All devices + /// sharing the same ID share a single IOAS (one set of page tables). + pub iommu_id: String, } impl ResourceId for VfioCdevDeviceHandle { diff --git a/vm/vmcore/vm_topology/src/memory.rs b/vm/vmcore/vm_topology/src/memory.rs index bf0fe05ea6..0bab55be39 100644 --- a/vm/vmcore/vm_topology/src/memory.rs +++ b/vm/vmcore/vm_topology/src/memory.rs @@ -246,6 +246,42 @@ impl MemoryLayout { } } + // On aarch64, the physical IOMMU reserves 128MB..129MB for the MSI + // doorbell window. iommufd inherits this and rejects DMA mappings + // that overlap it. Split any RAM range that crosses this window so + // that the region manager never maps it. + // + // TODO: query the actual reserved ranges from iommufd at runtime + // via `IOMMU_IOAS_IOVA_RANGES` instead of hardcoding. + #[cfg(guest_arch = "aarch64")] + { + const IOMMU_MSI_RESERVED: MemoryRange = MemoryRange::new(0x800_0000..0x810_0000); + let mut split_ram = Vec::with_capacity(ram.len() + 2); + for entry in ram { + if !entry.range.overlaps(&IOMMU_MSI_RESERVED) { + split_ram.push(entry); + } else { + // Part before the reserved window. + if entry.range.start() < IOMMU_MSI_RESERVED.start() { + split_ram.push(MemoryRangeWithNode { + range: MemoryRange::new( + entry.range.start()..IOMMU_MSI_RESERVED.start(), + ), + vnode: entry.vnode, + }); + } + // Part after the reserved window. 
+ if entry.range.end() > IOMMU_MSI_RESERVED.end() { + split_ram.push(MemoryRangeWithNode { + range: MemoryRange::new(IOMMU_MSI_RESERVED.end()..entry.range.end()), + vnode: entry.vnode, + }); + } + } + } + ram = split_ram; + } + Self::build( ram, mmio_gaps.to_vec(), From 71774ed3121afbeb09b2260462641316c1e6378f Mon Sep 17 00:00:00 2001 From: John Starks Date: Wed, 13 May 2026 19:59:29 +0000 Subject: [PATCH 03/10] feedback --- Guide/src/reference/openvmm/management/cli.md | 8 +- Guide/src/user_guide/openvmm/vfio.md | 41 +++++-- openvmm/openvmm_entry/src/cli_args.rs | 103 +++++++++++++++--- vm/devices/user_driver/vfio_sys/src/cdev.rs | 38 +------ .../user_driver/vfio_sys/src/iommufd.rs | 7 +- 5 files changed, 133 insertions(+), 64 deletions(-) diff --git a/Guide/src/reference/openvmm/management/cli.md b/Guide/src/reference/openvmm/management/cli.md index 4e9a2058c2..195c9444b7 100644 --- a/Guide/src/reference/openvmm/management/cli.md +++ b/Guide/src/reference/openvmm/management/cli.md @@ -215,8 +215,12 @@ For `--virtio-rng` and `--virtio-console`, use their separate PCIe port flags: --vhost-user /tmp/virtiofsd.sock,type=fs,tag=myfs,pcie_port=rp0 ``` -**VFIO device assignment** (Linux only): `--vfio` +**VFIO device assignment** (Linux only): `--vfio` (and optional `--iommu`) ```sh ---vfio rp0:0000:01:00.0 +# Legacy VFIO group/container path: +--vfio host=0000:01:00.0,port=rp0 + +# Modern VFIO cdev + iommufd path (Linux >= 6.6): +--iommu id=iommu0 --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 ``` diff --git a/Guide/src/user_guide/openvmm/vfio.md b/Guide/src/user_guide/openvmm/vfio.md index 3a6b59f43d..5755dd283e 100644 --- a/Guide/src/user_guide/openvmm/vfio.md +++ b/Guide/src/user_guide/openvmm/vfio.md @@ -104,7 +104,7 @@ Use the `--vfio` flag to assign the device to a PCIe root port. 
You also need to sudo openvmm \ --pcie-root-complex rc0 \ --pcie-root-port rc0:rp0 \ - --vfio rp0:0000:01:00.0 \ + --vfio host=0000:01:00.0,port=rp0 \ --kernel /path/to/vmlinux \ --initrd /path/to/initrd \ --cmdline "console=ttyS0" \ @@ -113,20 +113,47 @@ sudo openvmm \ --processors 2 ``` -The `--vfio` syntax is `:`: +The `--vfio` value is a comma-separated list of `key=value` pairs: -- `rp0` — the name of the PCIe root port to attach the device to (must match a `--pcie-root-port` name) -- `0000:01:00.0` — the PCI BDF of the VFIO device on the host +- `host=` (required) — the PCI BDF of the VFIO device on the host (e.g., `0000:01:00.0`) +- `port=` (required) — the name of the PCIe root port to attach the device to (must match a `--pcie-root-port` name) +- `iommu=` (optional) — reference to an `--iommu` context; see [Using iommufd (cdev path)](#using-iommufd-cdev-path) below ```admonish tip You can assign multiple devices by adding more root ports and `--vfio` flags: --pcie-root-port rc0:rp0 \ --pcie-root-port rc0:rp1 \ - --vfio rp0:0000:01:00.0 \ - --vfio rp1:334c:00:00.0 + --vfio host=0000:01:00.0,port=rp0 \ + --vfio host=334c:00:00.0,port=rp1 ``` +### Using iommufd (cdev path) + +By default, `--vfio` uses the legacy VFIO group/container interface with the +Type1v2 IOMMU driver. On hosts with Linux kernel 6.6 or newer, OpenVMM can +instead use the modern VFIO cdev (per-device fd) + iommufd interface. Enable +it by declaring an `--iommu` context and referencing it from each `--vfio` +device with the `iommu=` key: + +```bash +sudo openvmm \ + --pcie-root-complex rc0 \ + --pcie-root-port rc0:rp0 \ + --iommu id=iommu0 \ + --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 \ + ... +``` + +The `--iommu` syntax is `id=`. All `--vfio` devices that reference the +same `id` share a single iommufd IOAS (one set of IOMMU page tables and one +DMA mapper registration). The IOAS is allocated on demand the first time a +device referencing the id is opened. 
+ +Devices opened via the cdev path read their device node from +`/sys/bus/pci/devices//vfio-dev/vfioN` and open +`/dev/vfio/devices/vfioN` instead of `/dev/vfio/`. + ## Step 6: Verify in the guest If the guest boots with PCI support, the assigned device should be visible: @@ -164,7 +191,7 @@ Then request hugepage-backed RAM with the `--memory` option: sudo openvmm \ --pcie-root-complex rc0 \ --pcie-root-port rc0:rp0 \ - --vfio rp0:0000:01:00.0 \ + --vfio host=0000:01:00.0,port=rp0 \ --kernel /path/to/vmlinux \ --initrd /path/to/initrd \ --cmdline "console=ttyS0" \ diff --git a/openvmm/openvmm_entry/src/cli_args.rs b/openvmm/openvmm_entry/src/cli_args.rs index e76d1e5cf8..1a133976d5 100644 --- a/openvmm/openvmm_entry/src/cli_args.rs +++ b/openvmm/openvmm_entry/src/cli_args.rs @@ -919,8 +919,9 @@ Keys: /// Create an iommufd context for VFIO cdev device assignment #[clap(long_help = r#" -Create an iommufd context. Opens /dev/iommu and allocates an IOAS. -Referenced by --vfio devices via the iommu= key. +Declare an iommufd context. Opens /dev/iommu so it can be referenced by +--vfio devices via the iommu= key. The associated IOAS is allocated +the first time a --vfio device referring to this id is opened. Requires Linux kernel >= 6.6 with iommufd support. 
@@ -2462,18 +2463,36 @@ impl FromStr for VfioDeviceCli { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - let mut host = None; - let mut port = None; - let mut iommu = None; + let mut host: Option = None; + let mut port: Option = None; + let mut iommu: Option = None; for kv in s.split(',') { let (key, value) = kv .split_once('=') .context("expected key=value pair (e.g., host=0000:01:00.0,port=rp0)")?; + if value.is_empty() { + anyhow::bail!("--vfio: '{key}=' value cannot be empty"); + } match key { - "host" => host = Some(value.to_string()), - "port" => port = Some(value.to_string()), - "iommu" => iommu = Some(value.to_string()), + "host" => { + if host.is_some() { + anyhow::bail!("duplicate --vfio key: 'host'"); + } + host = Some(value.to_string()); + } + "port" => { + if port.is_some() { + anyhow::bail!("duplicate --vfio key: 'port'"); + } + port = Some(value.to_string()); + } + "iommu" => { + if iommu.is_some() { + anyhow::bail!("duplicate --vfio key: 'iommu'"); + } + iommu = Some(value.to_string()); + } _ => anyhow::bail!("unknown --vfio key: '{key}'"), } } @@ -2481,13 +2500,6 @@ impl FromStr for VfioDeviceCli { let pci_id = host.context("--vfio: 'host=' is required")?; let port_name = port.context("--vfio: 'port=' is required")?; - if pci_id.is_empty() { - anyhow::bail!("host PCI address cannot be empty"); - } - if port_name.is_empty() { - anyhow::bail!("port name cannot be empty"); - } - // Reject path separators to prevent sysfs path traversal via Path::join. if pci_id.contains('/') || pci_id.contains("..") { anyhow::bail!("PCI address must not contain path separators"); @@ -3900,4 +3912,65 @@ mod tests { let opt = Options::try_parse_from(["openvmm", "--pidfile", "/tmp/test.pid"]).unwrap(); assert_eq!(opt.pidfile, Some(PathBuf::from("/tmp/test.pid"))); } + + #[cfg(target_os = "linux")] + #[test] + fn test_vfio_device_cli_parse() { + // Required keys only. 
+ let v = VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0").unwrap(); + assert_eq!(v.pci_id, "0000:01:00.0"); + assert_eq!(v.port_name, "rp0"); + assert_eq!(v.iommu, None); + + // With optional iommu= key. Keys may appear in any order. + let v = VfioDeviceCli::from_str("port=rp1,iommu=iommu0,host=0000:02:00.0").unwrap(); + assert_eq!(v.pci_id, "0000:02:00.0"); + assert_eq!(v.port_name, "rp1"); + assert_eq!(v.iommu.as_deref(), Some("iommu0")); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_vfio_device_cli_errors() { + // Missing required keys. + assert!(VfioDeviceCli::from_str("port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0").is_err()); + + // Unknown key. + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,foo=bar").is_err()); + + // Duplicate keys are rejected. + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,host=0000:02:00.0,port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,port=rp1").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,iommu=a,iommu=b").is_err()); + + // Empty values are rejected. + assert!(VfioDeviceCli::from_str("host=,port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,iommu=").is_err()); + + // Missing '=' separator. + assert!(VfioDeviceCli::from_str("host").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,iommu").is_err()); + + // Path-traversal characters in the host BDF are rejected. + assert!(VfioDeviceCli::from_str("host=../../etc/passwd,port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=foo/bar,port=rp0").is_err()); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_iommu_cli_parse() { + let c = IommuCli::from_str("id=iommu0").unwrap(); + assert_eq!(c.id, "iommu0"); + + // Wrong key. + assert!(IommuCli::from_str("name=iommu0").is_err()); + + // Missing '=' separator. 
+ assert!(IommuCli::from_str("iommu0").is_err()); + + // Empty id. + assert!(IommuCli::from_str("id=").is_err()); + } } diff --git a/vm/devices/user_driver/vfio_sys/src/cdev.rs b/vm/devices/user_driver/vfio_sys/src/cdev.rs index d6792bc415..c8e8987199 100644 --- a/vm/devices/user_driver/vfio_sys/src/cdev.rs +++ b/vm/devices/user_driver/vfio_sys/src/cdev.rs @@ -83,40 +83,6 @@ pub struct CdevDevice { } impl CdevDevice { - /// Open a VFIO cdev device by its sysfs PCI address. - /// - /// Looks up the device's cdev node via - /// `/sys/bus/pci/devices//vfio-dev/` and opens the corresponding - /// `/dev/vfio/devices/vfioN` character device. - pub fn open(pci_id: &str) -> anyhow::Result { - let vfio_dev_dir = std::path::Path::new("/sys/bus/pci/devices") - .join(pci_id) - .join("vfio-dev"); - - // The vfio-dev/ directory contains a single entry like "vfio0" - let entry = fs::read_dir(&vfio_dev_dir) - .with_context(|| { - format!( - "failed to read {}: is the device bound to vfio-pci?", - vfio_dev_dir.display() - ) - })? - .next() - .context("no vfio-dev entry found")? - .context("failed to read vfio-dev entry")?; - - let dev_name = entry.file_name(); - let dev_path = std::path::Path::new("/dev/vfio/devices").join(&dev_name); - - let file = fs::OpenOptions::new() - .read(true) - .write(true) - .open(&dev_path) - .with_context(|| format!("failed to open {}", dev_path.display()))?; - - Ok(Self { file }) - } - /// Wrap a pre-opened VFIO cdev file descriptor. pub fn from_file(file: fs::File) -> Self { Self { file } @@ -143,8 +109,8 @@ impl CdevDevice { /// Attach the device to an IOAS or HWPT by its iommufd object ID. /// - /// For Phase 4 (identity DMA), pass an IOAS ID. For Phase 5+ (nested - /// translation), pass a HWPT ID. + /// Pass an IOAS ID for identity DMA translation, or a HWPT ID for + /// nested translation. /// /// Returns the attached page table ID (may differ from input if the /// kernel auto-created a HWPT for the IOAS). 
diff --git a/vm/devices/user_driver/vfio_sys/src/iommufd.rs b/vm/devices/user_driver/vfio_sys/src/iommufd.rs index 8cf6bd6300..56cc0643f0 100644 --- a/vm/devices/user_driver/vfio_sys/src/iommufd.rs +++ b/vm/devices/user_driver/vfio_sys/src/iommufd.rs @@ -4,9 +4,8 @@ //! Bindings for the Linux iommufd subsystem (`/dev/iommu`). //! //! Provides safe wrappers around `IOMMU_IOAS_ALLOC`, `IOMMU_IOAS_MAP`, -//! `IOMMU_IOAS_UNMAP`, and `IOMMU_DESTROY` ioctls. These are the Phase 4 -//! (identity DMA mapping) operations. Phase 5+ will add nested HWPT, -//! vIOMMU, vDevice, and vEVENTQ operations. +//! `IOMMU_IOAS_UNMAP`, and `IOMMU_DESTROY` ioctls, which together support +//! identity DMA mapping via an IOAS. use anyhow::Context as _; use std::fs; @@ -105,7 +104,7 @@ pub struct IommuIoasUnmap { /// An open iommufd file descriptor (`/dev/iommu`). /// /// Wraps the fd and provides safe methods for the iommufd ioctls needed -/// for Phase 4 (identity DMA mapping via IOAS). +/// to allocate an IOAS and map/unmap host memory into it. pub struct IommufdCtx { file: fs::File, } From 85087ac33a6ffa0bf3069ceb5b382b68138c6f0f Mon Sep 17 00:00:00 2001 From: John Starks Date: Wed, 13 May 2026 20:48:24 +0000 Subject: [PATCH 04/10] vfio: drop bogus _ prefix on VfioCdevBindingState fields The fields were prefixed with _ as if unused, but the inspected fields appear in inspect output and device_id/sender are explicitly read in the Drop impl. Drop the prefixes so the names reflect actual use. 
--- .../pci/vfio_assigned_device/src/manager.rs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 1dadfed2fd..23dbe7b56f 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -832,11 +832,11 @@ impl VfioCdevBinding { ( device, VfioCdevBindingState { - _pci_id: pci_id, - _iommufd_devid: iommufd_devid, - _ioas_id: ioas_id, - _device_id: device_id, - _sender: sender, + pci_id, + iommufd_devid, + ioas_id, + device_id, + sender, }, ) } @@ -849,19 +849,19 @@ impl VfioCdevBinding { /// are cleaned up. #[derive(Inspect)] pub(crate) struct VfioCdevBindingState { - _pci_id: String, - _iommufd_devid: u32, - _ioas_id: u32, + pci_id: String, + iommufd_devid: u32, + ioas_id: u32, #[inspect(skip)] - _device_id: u64, + device_id: u64, #[inspect(skip)] - _sender: mesh::Sender, + sender: mesh::Sender, } impl Drop for VfioCdevBindingState { fn drop(&mut self) { - self._sender - .send(VfioCdevManagerRpc::RemoveDevice(self._device_id)); + self.sender + .send(VfioCdevManagerRpc::RemoveDevice(self.device_id)); } } From a5e630b95b209a1e18ac8e22c6629821bdcd03c4 Mon Sep 17 00:00:00 2001 From: John Starks Date: Wed, 13 May 2026 22:52:30 -0700 Subject: [PATCH 05/10] improvements? 
--- openvmm/openvmm_core/src/worker/dispatch.rs | 13 +- .../pci/vfio_assigned_device/src/manager.rs | 460 ++++++++++-------- .../pci/vfio_assigned_device/src/resolver.rs | 21 +- vm/vmcore/vm_topology/src/memory.rs | 36 -- 4 files changed, 289 insertions(+), 241 deletions(-) diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index c4720a8fea..1f89cc88a1 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -713,6 +713,9 @@ struct LoadedVmInner { /// VFIO container manager inspect handle (Linux only). #[cfg(target_os = "linux")] vfio_inspect: Option, + /// VFIO cdev + iommufd manager inspect handle (Linux only). + #[cfg(target_os = "linux")] + vfio_cdev_inspect: Option, // relay halt messages, intercepting reset if configured. halt_recv: mesh::Receiver, @@ -1953,7 +1956,7 @@ impl InitializedVm { // Register the VFIO resolver, which spawns a container manager task // internally to share containers across assigned devices. #[cfg(target_os = "linux")] - let vfio_inspect = { + let (vfio_inspect, vfio_cdev_inspect) = { let dma_mapper_client = memory_manager.dma_mapper_client(); let vfio_resolver = vfio_assigned_device::resolver::VfioDeviceResolver::new( driver_source.builder().build("vfio-container-mgr"), @@ -1974,6 +1977,7 @@ impl InitializedVm { driver_source.builder().build("vfio-cdev-mgr"), dma_mapper_client, ); + let cdev_handle = cdev_resolver.inspect_handle(); resolver.add_async_resolver::< vm_resource::kind::PciDeviceHandleKind, _, @@ -1981,7 +1985,7 @@ impl InitializedVm { _, >(cdev_resolver); - Some(handle) + (Some(handle), Some(cdev_handle)) }; // Resolve PCIe devices concurrently. 
@@ -2561,6 +2565,8 @@ impl InitializedVm { vmgs_client_inspect_handle, #[cfg(target_os = "linux")] vfio_inspect, + #[cfg(target_os = "linux")] + vfio_cdev_inspect, halt_recv, client_notify_send, automatic_guest_reset: cfg.automatic_guest_reset, @@ -2953,7 +2959,8 @@ impl LoadedVm { .field("resolver", &self.inner.resolver) .field("vmgs", &self.inner.vmgs_client_inspect_handle); #[cfg(target_os = "linux")] - resp.field("vfio", &self.inner.vfio_inspect); + resp.field("vfio", &self.inner.vfio_inspect) + .field("vfio_cdev", &self.inner.vfio_cdev_inspect); }), }, Event::VmRpc(Err(_)) => break, diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 23dbe7b56f..7584f51a13 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -15,6 +15,7 @@ use inspect::{Inspect, InspectMut}; use membacking::DmaMapperClient; use mesh::rpc::FailableRpc; use mesh::rpc::RpcSend as _; +use pal_async::task::Spawn as _; use std::collections::HashMap; use std::fs::File; use std::os::unix::prelude::*; @@ -507,36 +508,182 @@ impl membacking::DmaTarget for IommufdDmaTarget { } } -/// Shared iommufd IOAS context for a single `--iommu` instance. +// --- Per-iommu-context manager (IoasManager) --- + +/// RPC messages for a per-iommu [`IoasManager`] task. +pub(crate) enum IoasManagerRpc { + /// Bind and attach a cdev device to this manager's IOAS. + PrepareDevice { + pci_id: String, + cdev: File, + /// The response half of the original RPC from the resolver. + respond: FailableRpc<(), CdevPrepareResponse>, + }, + /// Notify that a device has been dropped. + RemoveDevice(u64), + /// Inspect. + Inspect(inspect::Deferred), +} + +/// Manages a single iommufd IOAS context for one `--iommu` instance. /// -/// All VFIO cdev devices referencing the same `--iommu id=` share -/// one IOAS — one set of IOMMU page tables, one DMA mapper registration. 
-/// This matches the legacy path's container-sharing behavior and QEMU's -/// iommufd backend model. -struct IoasEntry { +/// Each `--iommu id=` gets its own `IoasManager` task, which owns +/// the iommufd context, IOAS, and DMA mapper registration. Devices +/// referencing the same `--iommu` ID share one IOAS — one set of IOMMU +/// page tables, one DMA mapper registration. Devices on different +/// `--iommu` IDs are handled by separate `IoasManager` tasks concurrently. +struct IoasManager { + iommu_id: String, ctx: Arc, ioas_id: u32, /// Keeps the DMA mapper registered with the region manager. _dma_handle: membacking::DmaMapperHandle, - /// Number of devices currently using this IOAS. - device_count: u64, + /// Active devices on this IOAS. + devices: Vec, + /// Next device ID (unique within this manager). + next_device_id: u64, + recv: mesh::Receiver, } /// Tracks a cdev device for inspect and cleanup. struct CdevDeviceEntry { id: u64, pci_id: String, - iommu_id: String, } -/// RPC messages for the cdev manager task. +impl IoasManager { + /// Create and initialize a new per-iommu manager. + /// + /// Allocates an IOAS on the given iommufd fd and registers it with + /// the region manager for DMA mapping. 
+ async fn new( + iommu_id: String, + iommufd: File, + dma_mapper_client: &DmaMapperClient, + recv: mesh::Receiver, + ) -> anyhow::Result { + let ctx = Arc::new(vfio_sys::iommufd::IommufdCtx::from_file(iommufd)); + let ioas_id = ctx + .ioas_alloc() + .context("failed to allocate iommufd IOAS")?; + + let dma_target: Arc = Arc::new(IommufdDmaTarget { + ctx: ctx.clone(), + ioas_id, + }); + let dma_handle = dma_mapper_client + .add_dma_mapper(dma_target, true) + .await + .context("failed to register iommufd IOAS with region manager")?; + + tracing::info!(iommu_id, ioas_id, "created iommufd IOAS for iommu context"); + + Ok(Self { + iommu_id, + ctx, + ioas_id, + _dma_handle: dma_handle, + devices: Vec::new(), + next_device_id: 0, + recv, + }) + } + + /// Run the per-iommu manager task, processing RPCs until the channel + /// closes. + async fn run(mut self) { + while let Ok(rpc) = self.recv.recv().await { + match rpc { + IoasManagerRpc::PrepareDevice { + pci_id, + cdev, + respond, + } => { + respond + .handle_failable(async |()| self.prepare_device(pci_id, cdev)) + .await + } + IoasManagerRpc::RemoveDevice(device_id) => { + self.remove_device(device_id); + } + IoasManagerRpc::Inspect(deferred) => deferred.inspect(&self), + } + } + } + + fn prepare_device( + &mut self, + pci_id: String, + cdev_file: File, + ) -> anyhow::Result { + let cdev = vfio_sys::cdev::CdevDevice::from_file(cdev_file); + + // Bind the cdev device to this iommu context's iommufd. + let devid = cdev + .bind_iommufd(self.ctx.as_raw_fd()) + .context("failed to bind VFIO cdev to iommufd")?; + + // Attach the device to the shared IOAS. 
+ cdev.attach_ioas(self.ioas_id) + .context("failed to attach cdev device to IOAS")?; + + let device_id = self.next_device_id; + self.next_device_id += 1; + + self.devices.push(CdevDeviceEntry { + id: device_id, + pci_id: pci_id.clone(), + }); + + tracing::info!( + pci_id, + iommu_id = self.iommu_id, + iommufd_devid = devid, + ioas_id = self.ioas_id, + device_id, + "VFIO cdev device attached to IOAS" + ); + + Ok(CdevPrepareResponse { + device: cdev.into_device(), + iommufd_devid: devid, + ioas_id: self.ioas_id, + device_id, + manager_send: self.recv.sender(), + }) + } + + fn remove_device(&mut self, device_id: u64) { + if let Some(pos) = self.devices.iter().position(|d| d.id == device_id) { + let entry = self.devices.swap_remove(pos); + tracing::info!( + device_id, + pci_id = entry.pci_id, + iommu_id = self.iommu_id, + "removing cdev device" + ); + } + } +} + +impl Inspect for IoasManager { + fn inspect(&self, req: inspect::Request<'_>) { + let mut resp = req.respond(); + resp.field("ioas_id", self.ioas_id); + resp.field("device_count", self.devices.len()); + for dev in &self.devices { + resp.field(&dev.pci_id, ()); + } + } +} + +// --- Cdev dispatcher (VfioCdevManager) --- + +/// RPC messages for the cdev dispatcher. pub(crate) enum VfioCdevManagerRpc { - /// Bind a cdev device to an IOAS (creating the IOAS if this is the - /// first device for the given iommu ID). Returns the opened - /// `vfio_sys::Device` and metadata needed for `VfioCdevBindingState`. + /// Bind a cdev device to an IOAS, spawning a per-iommu manager if + /// this is the first device for the given iommu ID. PrepareDevice(FailableRpc), - /// Notify that a device has been dropped. - RemoveDevice(u64), /// Inspect. Inspect(inspect::Deferred), } @@ -555,33 +702,30 @@ pub(crate) struct CdevPrepareResponse { pub iommufd_devid: u32, pub ioas_id: u32, pub device_id: u64, + /// Sender to the per-iommu manager for drop notification. 
+ pub manager_send: mesh::Sender, } -/// Manages iommufd IOAS contexts and cdev device bindings. +/// Dispatches cdev device requests to per-iommu [`IoasManager`] tasks. /// -/// Analogous to [`VfioContainerManager`] for the legacy group path. -/// Shares a single IOAS (and DMA mapper registration) across all devices -/// referencing the same `--iommu` ID. -#[derive(InspectMut)] -#[inspect(extra = "Self::inspect_topology")] +/// Unlike the legacy [`VfioContainerManager`] which makes cross-device +/// sharing decisions, the cdev dispatcher simply routes each device to +/// the manager for its `--iommu` ID. Each per-iommu manager runs as a +/// separate task, so devices on different `--iommu` contexts are +/// prepared concurrently. pub(crate) struct VfioCdevManager { - /// IOAS contexts keyed by `--iommu` ID. - #[inspect(skip)] - ioas_entries: HashMap, - /// Active devices. - #[inspect(skip)] - devices: Vec, - /// Next device ID. - #[inspect(skip)] - next_device_id: u64, - /// DMA mapper client for registering new IOAS contexts. - #[inspect(skip)] + /// Per-iommu manager senders, keyed by `--iommu` ID. + managers: HashMap>, + /// DMA mapper client, cloned for each new per-iommu manager. dma_mapper_client: DmaMapperClient, - #[inspect(skip)] + /// Spawner for per-iommu manager tasks. + spawner: Arc, + /// Per-iommu manager tasks (kept alive). + _tasks: Vec>, recv: mesh::Receiver, } -/// Client handle for the [`VfioCdevManager`] task. +/// Client handle for the [`VfioCdevManager`] dispatcher. #[derive(Clone)] pub struct VfioCdevManagerClient { sender: mesh::Sender, @@ -603,169 +747,105 @@ impl VfioCdevManagerClient { .call_failable(VfioCdevManagerRpc::PrepareDevice, req) .await?) } - - /// Returns a clone of the sender for passing to `VfioCdevBinding`. - pub(crate) fn sender(&self) -> mesh::Sender { - self.sender.clone() - } } impl VfioCdevManager { - /// Create a new cdev manager. 
- pub fn new(dma_mapper_client: DmaMapperClient) -> Self { + /// Create a new cdev dispatcher. + pub fn new( + spawner: Arc, + dma_mapper_client: DmaMapperClient, + ) -> Self { Self { - ioas_entries: HashMap::new(), - devices: Vec::new(), - next_device_id: 0, + managers: HashMap::new(), dma_mapper_client, + spawner, + _tasks: Vec::new(), recv: mesh::Receiver::new(), } } - /// Run the cdev manager task, processing RPCs until the channel closes. + /// Run the dispatcher, routing device requests to per-iommu managers. pub async fn run(mut self) { while let Ok(rpc) = self.recv.recv().await { match rpc { VfioCdevManagerRpc::PrepareDevice(rpc) => { - rpc.handle_failable(async |req| self.prepare_device(req).await) - .await + let (req, respond) = rpc.split(); + match self.route_prepare(req, respond) { + Ok(()) => {} // forwarded to per-iommu manager + Err((e, respond)) => { + respond.fail(e); + } + } } - VfioCdevManagerRpc::RemoveDevice(device_id) => { - self.remove_device(device_id); + VfioCdevManagerRpc::Inspect(deferred) => { + deferred.respond(|resp| { + for (iommu_id, sender) in &self.managers { + resp.child(iommu_id, |req| { + sender.send(IoasManagerRpc::Inspect(req.defer())); + }); + } + }); } - VfioCdevManagerRpc::Inspect(deferred) => deferred.inspect(&mut self), } } } - async fn prepare_device( + /// Route a prepare request to the per-iommu manager, spawning one + /// if needed. Returns Ok(()) if forwarded, or Err with the error + /// and response handle if spawning failed. + fn route_prepare( &mut self, req: CdevPrepareRequest, - ) -> anyhow::Result { - let CdevPrepareRequest { - pci_id, - cdev: cdev_file, - iommufd: iommufd_file, - iommu_id, - } = req; - - tracing::info!(pci_id, iommu_id, "cdev manager: preparing device"); - - // Get or create the IOAS for this iommu ID. 
- if !self.ioas_entries.contains_key(&iommu_id) { - let ctx = Arc::new(vfio_sys::iommufd::IommufdCtx::from_file(iommufd_file)); - let ioas_id = ctx - .ioas_alloc() - .context("failed to allocate iommufd IOAS")?; - - let dma_target: Arc = Arc::new(IommufdDmaTarget { - ctx: ctx.clone(), - ioas_id, - }); - let dma_handle = self - .dma_mapper_client - .add_dma_mapper(dma_target, true) - .await - .context("failed to register iommufd IOAS with region manager")?; - - tracing::info!( - iommu_id, - ioas_id, - "created shared iommufd IOAS for iommu context" - ); - - self.ioas_entries.insert( - iommu_id.clone(), - IoasEntry { - ctx, - ioas_id, - _dma_handle: dma_handle, - device_count: 0, - }, - ); - } - - let entry = self.ioas_entries.get_mut(&iommu_id).unwrap(); - let cdev = vfio_sys::cdev::CdevDevice::from_file(cdev_file); - - // Bind the cdev device to this iommu context's iommufd. - let devid = cdev - .bind_iommufd(entry.ctx.as_raw_fd()) - .context("failed to bind VFIO cdev to iommufd")?; - - // Attach the device to the shared IOAS. 
- cdev.attach_ioas(entry.ioas_id) - .context("failed to attach cdev device to IOAS")?; - - let device_id = self.next_device_id; - self.next_device_id += 1; - entry.device_count += 1; - - self.devices.push(CdevDeviceEntry { - id: device_id, - pci_id: pci_id.clone(), - iommu_id: iommu_id.clone(), - }); - - tracing::info!( - pci_id, - iommu_id, - iommufd_devid = devid, - ioas_id = entry.ioas_id, - device_id, - "VFIO cdev device attached to shared IOAS" - ); - - Ok(CdevPrepareResponse { - device: cdev.into_device(), - iommufd_devid: devid, - ioas_id: entry.ioas_id, - device_id, - }) - } - - fn remove_device(&mut self, device_id: u64) { - if let Some(pos) = self.devices.iter().position(|d| d.id == device_id) { - let entry = self.devices.swap_remove(pos); - tracing::info!( - device_id, - pci_id = entry.pci_id, - iommu_id = entry.iommu_id, - "removing cdev device" - ); - - if let Some(ioas) = self.ioas_entries.get_mut(&entry.iommu_id) { - ioas.device_count -= 1; - if ioas.device_count == 0 { - tracing::info!( - iommu_id = entry.iommu_id, - "closing iommufd IOAS (no remaining devices)" - ); - self.ioas_entries.remove(&entry.iommu_id); + respond: FailableRpc<(), CdevPrepareResponse>, + ) -> Result<(), (anyhow::Error, FailableRpc<(), CdevPrepareResponse>)> { + let sender = if let Some(sender) = self.managers.get(&req.iommu_id) { + sender.clone() + } else { + let iommu_id = req.iommu_id.clone(); + let mut ioas_recv: mesh::Receiver = mesh::Receiver::new(); + let sender = ioas_recv.sender(); + + let dma_mapper_client = self.dma_mapper_client.clone(); + let iommu_id2 = iommu_id.clone(); + // The iommufd fd from this first request initializes the manager. + // Subsequent requests' iommufd fds (dup'd from the same underlying + // fd) are carried along but ignored by the per-iommu manager. 
+ let iommufd = match req.iommufd.try_clone() { + Ok(f) => f, + Err(e) => { + return Err(( + anyhow::Error::new(e).context("failed to dup iommufd fd"), + respond, + )); } - } - } - } - - fn inspect_topology(&self, resp: &mut inspect::Response<'_>) { - resp.child("ioas", |req| { - let mut resp = req.respond(); - for (iommu_id, entry) in &self.ioas_entries { - resp.child(iommu_id, |req| { - let mut resp = req.respond(); - resp.field("ioas_id", entry.ioas_id); - resp.field("device_count", entry.device_count); - resp.child("device", |req| { - let mut resp = req.respond(); - for dev in &self.devices { - if dev.iommu_id == *iommu_id { - resp.field(&dev.pci_id, ()); - } + }; + let task = self + .spawner + .spawn(format!("vfio-ioas-{iommu_id}"), async move { + match IoasManager::new(iommu_id2, iommufd, &dma_mapper_client, ioas_recv).await + { + Ok(mgr) => mgr.run().await, + Err(e) => { + tracing::error!( + error = format!("{e:#}"), + "failed to initialize iommufd IOAS manager" + ); + // The recv will be dropped, causing all pending + // and future RPCs to fail with channel-closed. } - }); + } }); - } + self._tasks.push(task); + self.managers.insert(iommu_id, sender.clone()); + sender + }; + + sender.send(IoasManagerRpc::PrepareDevice { + pci_id: req.pci_id, + cdev: req.cdev, + respond, }); + Ok(()) } pub(crate) fn client(&mut self) -> VfioCdevManagerClient { @@ -778,8 +858,7 @@ impl VfioCdevManager { /// Binding for a VFIO device opened via the cdev + iommufd path. /// /// Analogous to [`VfioDeviceBinding`] for the legacy group path. -/// Notifies the cdev manager on drop so device counts stay accurate -/// and IOAS contexts are cleaned up when the last device is removed. +/// Notifies the per-iommu manager on drop so device counts stay accurate. #[derive(Inspect)] pub(crate) struct VfioCdevBinding { /// PCI BDF address on the host. @@ -791,35 +870,31 @@ pub(crate) struct VfioCdevBinding { iommufd_devid: u32, /// IOAS ID this device is attached to. 
ioas_id: u32, - /// Device ID assigned by the manager (for drop notification). + /// Device ID assigned by the per-iommu manager (for drop notification). #[inspect(skip)] device_id: u64, - /// Sender to notify the manager on drop. + /// Sender to the per-iommu manager for drop notification. #[inspect(skip)] - sender: mesh::Sender, + manager_send: mesh::Sender, } impl VfioCdevBinding { - /// Create from a manager response. - pub(crate) fn from_response( - resp: CdevPrepareResponse, - pci_id: String, - sender: mesh::Sender, - ) -> Self { + /// Create from a dispatcher response. + pub(crate) fn from_response(resp: CdevPrepareResponse, pci_id: String) -> Self { Self { pci_id, device: resp.device, iommufd_devid: resp.iommufd_devid, ioas_id: resp.ioas_id, device_id: resp.device_id, - sender, + manager_send: resp.manager_send, } } /// Consume the binding and split into the `Device` (for constructing /// `VfioAssignedPciDevice`) and the remaining binding state (for - /// lifetime management). The state's `Drop` impl notifies the manager - /// when the device is released. + /// lifetime management). The state's `Drop` impl notifies the per-iommu + /// manager when the device is released. pub fn into_parts(self) -> (vfio_sys::Device, VfioCdevBindingState) { let Self { pci_id, @@ -827,7 +902,7 @@ impl VfioCdevBinding { iommufd_devid, ioas_id, device_id, - sender, + manager_send, } = self; ( device, @@ -836,7 +911,7 @@ impl VfioCdevBinding { iommufd_devid, ioas_id, device_id, - sender, + manager_send, }, ) } @@ -845,8 +920,7 @@ impl VfioCdevBinding { /// The iommufd-related state from a [`VfioCdevBinding`], kept alive for /// the lifetime of the assigned device. /// -/// Notifies the cdev manager on drop so device counts and IOAS contexts -/// are cleaned up. +/// Notifies the per-iommu manager on drop so device counts are accurate. 
#[derive(Inspect)] pub(crate) struct VfioCdevBindingState { pci_id: String, @@ -855,13 +929,13 @@ pub(crate) struct VfioCdevBindingState { #[inspect(skip)] device_id: u64, #[inspect(skip)] - sender: mesh::Sender, + manager_send: mesh::Sender, } impl Drop for VfioCdevBindingState { fn drop(&mut self) { - self.sender - .send(VfioCdevManagerRpc::RemoveDevice(self.device_id)); + self.manager_send + .send(IoasManagerRpc::RemoveDevice(self.device_id)); } } diff --git a/vm/devices/pci/vfio_assigned_device/src/resolver.rs b/vm/devices/pci/vfio_assigned_device/src/resolver.rs index a21ca829d3..a91ee25191 100644 --- a/vm/devices/pci/vfio_assigned_device/src/resolver.rs +++ b/vm/devices/pci/vfio_assigned_device/src/resolver.rs @@ -9,8 +9,10 @@ use crate::manager::VfioManagerClient; use anyhow::Context as _; use async_trait::async_trait; use membacking::DmaMapperClient; +use pal_async::task::Spawn as _; use pci_resources::ResolvePciDeviceHandleParams; use pci_resources::ResolvedPciDevice; +use std::sync::Arc; use vfio_assigned_device_resources::VfioCdevDeviceHandle; use vfio_assigned_device_resources::VfioDeviceHandle; use vm_resource::AsyncResolveResource; @@ -100,11 +102,16 @@ pub struct VfioCdevDeviceResolver { } impl VfioCdevDeviceResolver { - /// Create a new cdev resolver, spawning the cdev manager task. - pub fn new(spawner: impl pal_async::task::Spawn, dma_mapper_client: DmaMapperClient) -> Self { - let mut manager = crate::manager::VfioCdevManager::new(dma_mapper_client); + /// Create a new cdev resolver, spawning the cdev dispatcher task. + pub fn new( + spawner: impl pal_async::task::Spawn + 'static, + dma_mapper_client: DmaMapperClient, + ) -> Self { + // Arc the spawner so the dispatcher can spawn per-iommu manager tasks. 
+ let spawner: Arc = Arc::new(spawner); + let mut manager = crate::manager::VfioCdevManager::new(spawner.clone(), dma_mapper_client); let client = manager.client(); - let task = spawner.spawn("vfio-cdev-mgr", manager.run()); + let task = spawner.spawn("vfio-cdev-dispatch", manager.run()); Self { client, _task: task, @@ -148,11 +155,7 @@ impl AsyncResolveResource for VfioCde .await .context("VFIO cdev manager failed")?; - let cdev_binding = crate::manager::VfioCdevBinding::from_response( - resp, - pci_id.clone(), - self.client.sender(), - ); + let cdev_binding = crate::manager::VfioCdevBinding::from_response(resp, pci_id.clone()); let memory_mapper = input .shared_mem_mapper diff --git a/vm/vmcore/vm_topology/src/memory.rs b/vm/vmcore/vm_topology/src/memory.rs index 0bab55be39..bf0fe05ea6 100644 --- a/vm/vmcore/vm_topology/src/memory.rs +++ b/vm/vmcore/vm_topology/src/memory.rs @@ -246,42 +246,6 @@ impl MemoryLayout { } } - // On aarch64, the physical IOMMU reserves 128MB..129MB for the MSI - // doorbell window. iommufd inherits this and rejects DMA mappings - // that overlap it. Split any RAM range that crosses this window so - // that the region manager never maps it. - // - // TODO: query the actual reserved ranges from iommufd at runtime - // via `IOMMU_IOAS_IOVA_RANGES` instead of hardcoding. - #[cfg(guest_arch = "aarch64")] - { - const IOMMU_MSI_RESERVED: MemoryRange = MemoryRange::new(0x800_0000..0x810_0000); - let mut split_ram = Vec::with_capacity(ram.len() + 2); - for entry in ram { - if !entry.range.overlaps(&IOMMU_MSI_RESERVED) { - split_ram.push(entry); - } else { - // Part before the reserved window. - if entry.range.start() < IOMMU_MSI_RESERVED.start() { - split_ram.push(MemoryRangeWithNode { - range: MemoryRange::new( - entry.range.start()..IOMMU_MSI_RESERVED.start(), - ), - vnode: entry.vnode, - }); - } - // Part after the reserved window. 
- if entry.range.end() > IOMMU_MSI_RESERVED.end() { - split_ram.push(MemoryRangeWithNode { - range: MemoryRange::new(IOMMU_MSI_RESERVED.end()..entry.range.end()), - vnode: entry.vnode, - }); - } - } - } - ram = split_ram; - } - Self::build( ram, mmio_gaps.to_vec(), From fcaffc88a64ce40605716f9446bd66b01cbb2a3d Mon Sep 17 00:00:00 2001 From: John Starks Date: Wed, 13 May 2026 22:55:26 -0700 Subject: [PATCH 06/10] less clone --- .../pci/vfio_assigned_device/src/manager.rs | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 7584f51a13..408ce1abff 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -798,27 +798,21 @@ impl VfioCdevManager { req: CdevPrepareRequest, respond: FailableRpc<(), CdevPrepareResponse>, ) -> Result<(), (anyhow::Error, FailableRpc<(), CdevPrepareResponse>)> { - let sender = if let Some(sender) = self.managers.get(&req.iommu_id) { + let CdevPrepareRequest { + pci_id, + cdev, + iommufd, + iommu_id, + } = req; + + let sender = if let Some(sender) = self.managers.get(&iommu_id) { sender.clone() } else { - let iommu_id = req.iommu_id.clone(); let mut ioas_recv: mesh::Receiver = mesh::Receiver::new(); let sender = ioas_recv.sender(); let dma_mapper_client = self.dma_mapper_client.clone(); let iommu_id2 = iommu_id.clone(); - // The iommufd fd from this first request initializes the manager. - // Subsequent requests' iommufd fds (dup'd from the same underlying - // fd) are carried along but ignored by the per-iommu manager. 
- let iommufd = match req.iommufd.try_clone() { - Ok(f) => f, - Err(e) => { - return Err(( - anyhow::Error::new(e).context("failed to dup iommufd fd"), - respond, - )); - } - }; let task = self .spawner .spawn(format!("vfio-ioas-{iommu_id}"), async move { @@ -841,8 +835,8 @@ impl VfioCdevManager { }; sender.send(IoasManagerRpc::PrepareDevice { - pci_id: req.pci_id, - cdev: req.cdev, + pci_id, + cdev, respond, }); Ok(()) From f1888311a39174addd2d1f14dac76da123f10076 Mon Sep 17 00:00:00 2001 From: John Starks Date: Tue, 19 May 2026 23:28:08 -0700 Subject: [PATCH 07/10] feedback --- .../pci/vfio_assigned_device/src/manager.rs | 81 ++++++++++--------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 408ce1abff..9cc636b7d6 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -721,7 +721,7 @@ pub(crate) struct VfioCdevManager { /// Spawner for per-iommu manager tasks. spawner: Arc, /// Per-iommu manager tasks (kept alive). - _tasks: Vec>, + tasks: Vec>, recv: mesh::Receiver, } @@ -759,7 +759,7 @@ impl VfioCdevManager { managers: HashMap::new(), dma_mapper_client, spawner, - _tasks: Vec::new(), + tasks: Vec::new(), recv: mesh::Receiver::new(), } } @@ -770,12 +770,7 @@ impl VfioCdevManager { match rpc { VfioCdevManagerRpc::PrepareDevice(rpc) => { let (req, respond) = rpc.split(); - match self.route_prepare(req, respond) { - Ok(()) => {} // forwarded to per-iommu manager - Err((e, respond)) => { - respond.fail(e); - } - } + self.route_prepare(req, respond).await; } VfioCdevManagerRpc::Inspect(deferred) => { deferred.respond(|resp| { @@ -791,13 +786,18 @@ impl VfioCdevManager { } /// Route a prepare request to the per-iommu manager, spawning one - /// if needed. Returns Ok(()) if forwarded, or Err with the error - /// and response handle if spawning failed. 
- fn route_prepare( + /// if needed. Initializes the per-iommu manager inline on first use + /// so that init failures are reported directly to the caller. + /// + /// The actual bind/attach ioctls are forwarded to the per-iommu + /// manager task via fire-and-forget send, so the dispatcher is + /// immediately free to handle the next request. This allows devices + /// on different `--iommu` contexts to be prepared concurrently. + async fn route_prepare( &mut self, req: CdevPrepareRequest, respond: FailableRpc<(), CdevPrepareResponse>, - ) -> Result<(), (anyhow::Error, FailableRpc<(), CdevPrepareResponse>)> { + ) { let CdevPrepareRequest { pci_id, cdev, @@ -805,41 +805,44 @@ impl VfioCdevManager { iommu_id, } = req; - let sender = if let Some(sender) = self.managers.get(&iommu_id) { - sender.clone() - } else { - let mut ioas_recv: mesh::Receiver = mesh::Receiver::new(); - let sender = ioas_recv.sender(); - - let dma_mapper_client = self.dma_mapper_client.clone(); - let iommu_id2 = iommu_id.clone(); - let task = self - .spawner - .spawn(format!("vfio-ioas-{iommu_id}"), async move { - match IoasManager::new(iommu_id2, iommufd, &dma_mapper_client, ioas_recv).await - { - Ok(mgr) => mgr.run().await, - Err(e) => { - tracing::error!( - error = format!("{e:#}"), - "failed to initialize iommufd IOAS manager" - ); - // The recv will be dropped, causing all pending - // and future RPCs to fail with channel-closed. 
- } + let sender = match self.managers.entry(iommu_id.clone()) { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + let mut ioas_recv: mesh::Receiver = mesh::Receiver::new(); + let sender = ioas_recv.sender(); + + let mgr = match IoasManager::new( + iommu_id.clone(), + iommufd, + &self.dma_mapper_client, + ioas_recv, + ) + .await + .with_context(|| { + format!("failed to initialize iommufd IOAS manager for iommu={iommu_id}") + }) { + Ok(mgr) => mgr, + Err(e) => { + respond.fail(e); + return; } - }); - self._tasks.push(task); - self.managers.insert(iommu_id, sender.clone()); - sender + }; + + let task = self + .spawner + .spawn(format!("vfio-ioas-{iommu_id}"), mgr.run()); + self.tasks.push(task); + e.insert(sender) + } }; + // Forward to the per-iommu manager task. The manager will + // complete the respond half after the bind/attach ioctls. sender.send(IoasManagerRpc::PrepareDevice { pci_id, cdev, respond, }); - Ok(()) } pub(crate) fn client(&mut self) -> VfioCdevManagerClient { From 82b062b81ec0c652955a7a63dcf985acabe049b1 Mon Sep 17 00:00:00 2001 From: John Starks Date: Wed, 20 May 2026 09:16:24 -0700 Subject: [PATCH 08/10] feedback --- openvmm/openvmm_entry/src/lib.rs | 7 ++++--- vm/devices/user_driver/vfio_sys/src/lib.rs | 8 -------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 93bff58234..abeccd2fa8 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -791,9 +791,10 @@ async fn vm_config_from_command_line( cli_cfg.pci_id ) })?; - // Open a cloned iommufd fd (each device binding needs its own fd - // for VFIO_DEVICE_BIND_IOMMUFD, but they can share the same - // underlying iommufd kernel object via dup). + // Clone the iommufd fd so the per-iommu manager can own it. 
+ // The first device for a given iommu ID uses the cloned fd + // to create the IoasManager; subsequent devices reuse the + // existing manager and the cloned fd is dropped. let iommufd = iommufd.try_clone().with_context(|| { format!("failed to dup iommufd fd for iommu={iommu_id}") })?; diff --git a/vm/devices/user_driver/vfio_sys/src/lib.rs b/vm/devices/user_driver/vfio_sys/src/lib.rs index ae887228ec..abe6fc4032 100644 --- a/vm/devices/user_driver/vfio_sys/src/lib.rs +++ b/vm/devices/user_driver/vfio_sys/src/lib.rs @@ -216,14 +216,6 @@ impl Container { ioctl::vfio_iommu_unmap_dma(self.file.as_raw_fd(), &mut dma_unmap) .context("VFIO_IOMMU_UNMAP_DMA failed")?; } - if dma_unmap.size != size { - tracing::warn!( - iova, - requested = size, - actual = dma_unmap.size, - "VFIO_IOMMU_UNMAP_DMA: unmapped size differs from requested" - ); - } Ok(()) } } From dafcd9948782980e1d9a56664fea4fe2fd381108 Mon Sep 17 00:00:00 2001 From: John Starks Date: Wed, 20 May 2026 20:23:37 +0000 Subject: [PATCH 09/10] feedback --- vm/devices/pci/vfio_assigned_device/src/manager.rs | 2 +- vm/devices/pci/vfio_assigned_device/src/resolver.rs | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 9cc636b7d6..35d41acf42 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -725,7 +725,7 @@ pub(crate) struct VfioCdevManager { recv: mesh::Receiver, } -/// Client handle for the [`VfioCdevManager`] dispatcher. +/// Client handle for the `VfioCdevManager` dispatcher. 
#[derive(Clone)] pub struct VfioCdevManagerClient { sender: mesh::Sender, diff --git a/vm/devices/pci/vfio_assigned_device/src/resolver.rs b/vm/devices/pci/vfio_assigned_device/src/resolver.rs index a91ee25191..3310bac5ef 100644 --- a/vm/devices/pci/vfio_assigned_device/src/resolver.rs +++ b/vm/devices/pci/vfio_assigned_device/src/resolver.rs @@ -93,9 +93,8 @@ impl AsyncResolveResource for VfioDeviceR /// Resource resolver for [`VfioCdevDeviceHandle`] (cdev + iommufd path). /// -/// Spawns a [`VfioCdevManager`](crate::manager::VfioCdevManager) task -/// internally and communicates with it via RPC to share IOAS contexts -/// across devices referencing the same `--iommu` ID. +/// Spawns a `VfioCdevManager` task internally and communicates with it via RPC +/// to share IOAS contexts across devices referencing the same iommu ID. pub struct VfioCdevDeviceResolver { client: crate::manager::VfioCdevManagerClient, _task: pal_async::task::Task<()>, From 7e04a75b1994859b62321b585b6dd0a232be44a7 Mon Sep 17 00:00:00 2001 From: John Starks Date: Wed, 20 May 2026 21:19:22 +0000 Subject: [PATCH 10/10] feedback --- .../pci/vfio_assigned_device/src/manager.rs | 46 +++++-------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 35d41acf42..41b5c176c7 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -145,17 +145,12 @@ pub(crate) struct VfioContainerManager { /// /// Inspecting this sends a deferred inspect request to the container manager /// task, which reports the container/group/device topology. 
-#[derive(Clone)] +#[derive(Clone, Inspect)] pub struct VfioManagerClient { + #[inspect(flatten, send = "VfioManagerRpc::Inspect")] sender: mesh::Sender, } -impl Inspect for VfioManagerClient { - fn inspect(&self, req: inspect::Request<'_>) { - self.sender.send(VfioManagerRpc::Inspect(req.defer())); - } -} - impl VfioManagerClient { pub(crate) async fn prepare_device( &self, @@ -532,16 +527,22 @@ pub(crate) enum IoasManagerRpc { /// referencing the same `--iommu` ID share one IOAS — one set of IOMMU /// page tables, one DMA mapper registration. Devices on different /// `--iommu` IDs are handled by separate `IoasManager` tasks concurrently. +#[derive(Inspect)] struct IoasManager { iommu_id: String, + #[inspect(skip)] ctx: Arc, ioas_id: u32, /// Keeps the DMA mapper registered with the region manager. + #[inspect(skip)] _dma_handle: membacking::DmaMapperHandle, /// Active devices on this IOAS. + #[inspect(with = "|x| inspect::iter_by_key(x.iter().map(|d| (&d.pci_id, ())))")] devices: Vec, /// Next device ID (unique within this manager). + #[inspect(skip)] next_device_id: u64, + #[inspect(skip)] recv: mesh::Receiver, } @@ -666,17 +667,6 @@ impl IoasManager { } } -impl Inspect for IoasManager { - fn inspect(&self, req: inspect::Request<'_>) { - let mut resp = req.respond(); - resp.field("ioas_id", self.ioas_id); - resp.field("device_count", self.devices.len()); - for dev in &self.devices { - resp.field(&dev.pci_id, ()); - } - } -} - // --- Cdev dispatcher (VfioCdevManager) --- /// RPC messages for the cdev dispatcher. @@ -726,17 +716,12 @@ pub(crate) struct VfioCdevManager { } /// Client handle for the `VfioCdevManager` dispatcher. 
-#[derive(Clone)] +#[derive(Clone, Inspect)] pub struct VfioCdevManagerClient { + #[inspect(flatten, send = "VfioCdevManagerRpc::Inspect")] sender: mesh::Sender, } -impl Inspect for VfioCdevManagerClient { - fn inspect(&self, req: inspect::Request<'_>) { - self.sender.send(VfioCdevManagerRpc::Inspect(req.defer())); - } -} - impl VfioCdevManagerClient { pub(crate) async fn prepare_device( &self, @@ -940,16 +925,9 @@ impl Drop for VfioCdevBindingState { /// /// Kept as a field on `VfioAssignedPciDevice` to hold the underlying /// fd/handle resources alive for the device's lifetime. +#[derive(Inspect)] +#[inspect(external_tag)] pub(crate) enum VfioBinding { Group(VfioDeviceBinding), Cdev(VfioCdevBindingState), } - -impl Inspect for VfioBinding { - fn inspect(&self, req: inspect::Request<'_>) { - match self { - VfioBinding::Group(b) => b.inspect(req), - VfioBinding::Cdev(b) => b.inspect(req), - } - } -}