diff --git a/Guide/src/reference/openvmm/management/cli.md b/Guide/src/reference/openvmm/management/cli.md index 4e9a2058c2..195c9444b7 100644 --- a/Guide/src/reference/openvmm/management/cli.md +++ b/Guide/src/reference/openvmm/management/cli.md @@ -215,8 +215,12 @@ For `--virtio-rng` and `--virtio-console`, use their separate PCIe port flags: --vhost-user /tmp/virtiofsd.sock,type=fs,tag=myfs,pcie_port=rp0 ``` -**VFIO device assignment** (Linux only): `--vfio` +**VFIO device assignment** (Linux only): `--vfio` (and optional `--iommu`) ```sh ---vfio rp0:0000:01:00.0 +# Legacy VFIO group/container path: +--vfio host=0000:01:00.0,port=rp0 + +# Modern VFIO cdev + iommufd path (Linux >= 6.6): +--iommu id=iommu0 --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 ``` diff --git a/Guide/src/user_guide/openvmm/vfio.md b/Guide/src/user_guide/openvmm/vfio.md index 3a6b59f43d..5755dd283e 100644 --- a/Guide/src/user_guide/openvmm/vfio.md +++ b/Guide/src/user_guide/openvmm/vfio.md @@ -104,7 +104,7 @@ Use the `--vfio` flag to assign the device to a PCIe root port. 
You also need to sudo openvmm \ --pcie-root-complex rc0 \ --pcie-root-port rc0:rp0 \ - --vfio rp0:0000:01:00.0 \ + --vfio host=0000:01:00.0,port=rp0 \ --kernel /path/to/vmlinux \ --initrd /path/to/initrd \ --cmdline "console=ttyS0" \ @@ -113,20 +113,47 @@ sudo openvmm \ --processors 2 ``` -The `--vfio` syntax is `<port_name>:<pci_bdf>`: +The `--vfio` value is a comma-separated list of `key=value` pairs: -- `rp0` — the name of the PCIe root port to attach the device to (must match a `--pcie-root-port` name) -- `0000:01:00.0` — the PCI BDF of the VFIO device on the host +- `host=` (required) — the PCI BDF of the VFIO device on the host (e.g., `0000:01:00.0`) +- `port=` (required) — the name of the PCIe root port to attach the device to (must match a `--pcie-root-port` name) +- `iommu=` (optional) — reference to an `--iommu` context; see [Using iommufd (cdev path)](#using-iommufd-cdev-path) below ```admonish tip You can assign multiple devices by adding more root ports and `--vfio` flags: --pcie-root-port rc0:rp0 \ --pcie-root-port rc0:rp1 \ - --vfio rp0:0000:01:00.0 \ - --vfio rp1:334c:00:00.0 + --vfio host=0000:01:00.0,port=rp0 \ + --vfio host=334c:00:00.0,port=rp1 ``` +### Using iommufd (cdev path) + +By default, `--vfio` uses the legacy VFIO group/container interface with the +Type1v2 IOMMU driver. On hosts with Linux kernel 6.6 or newer, OpenVMM can +instead use the modern VFIO cdev (per-device fd) + iommufd interface. Enable +it by declaring an `--iommu` context and referencing it from each `--vfio` +device with the `iommu=` key: + +```bash +sudo openvmm \ + --pcie-root-complex rc0 \ + --pcie-root-port rc0:rp0 \ + --iommu id=iommu0 \ + --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 \ + ... +``` + +The `--iommu` syntax is `id=<name>`. All `--vfio` devices that reference the +same `id` share a single iommufd IOAS (one set of IOMMU page tables and one +DMA mapper registration). The IOAS is allocated on demand the first time a +device referencing the id is opened. 
+ +Devices opened via the cdev path read their device node from +`/sys/bus/pci/devices//vfio-dev/vfioN` and open +`/dev/vfio/devices/vfioN` instead of `/dev/vfio/`. + ## Step 6: Verify in the guest If the guest boots with PCI support, the assigned device should be visible: @@ -164,7 +191,7 @@ Then request hugepage-backed RAM with the `--memory` option: sudo openvmm \ --pcie-root-complex rc0 \ --pcie-root-port rc0:rp0 \ - --vfio rp0:0000:01:00.0 \ + --vfio host=0000:01:00.0,port=rp0 \ --kernel /path/to/vmlinux \ --initrd /path/to/initrd \ --cmdline "console=ttyS0" \ diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 4cbe51c1b8..1f89cc88a1 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -713,6 +713,9 @@ struct LoadedVmInner { /// VFIO container manager inspect handle (Linux only). #[cfg(target_os = "linux")] vfio_inspect: Option, + /// VFIO cdev + iommufd manager inspect handle (Linux only). + #[cfg(target_os = "linux")] + vfio_cdev_inspect: Option, // relay halt messages, intercepting reset if configured. halt_recv: mesh::Receiver, @@ -1953,10 +1956,11 @@ impl InitializedVm { // Register the VFIO resolver, which spawns a container manager task // internally to share containers across assigned devices. #[cfg(target_os = "linux")] - let vfio_inspect = { + let (vfio_inspect, vfio_cdev_inspect) = { + let dma_mapper_client = memory_manager.dma_mapper_client(); let vfio_resolver = vfio_assigned_device::resolver::VfioDeviceResolver::new( driver_source.builder().build("vfio-container-mgr"), - memory_manager.dma_mapper_client(), + dma_mapper_client.clone(), ); let handle = vfio_resolver.inspect_handle(); resolver.add_async_resolver::< @@ -1965,7 +1969,23 @@ impl InitializedVm { vfio_assigned_device_resources::VfioDeviceHandle, _, >(vfio_resolver); - Some(handle) + + // Register the VFIO cdev + iommufd resolver for devices opened + // via the cdev interface. 
Spawns a VfioCdevManager task that + // shares IOAS contexts across devices with the same --iommu ID. + let cdev_resolver = vfio_assigned_device::resolver::VfioCdevDeviceResolver::new( + driver_source.builder().build("vfio-cdev-mgr"), + dma_mapper_client, + ); + let cdev_handle = cdev_resolver.inspect_handle(); + resolver.add_async_resolver::< + vm_resource::kind::PciDeviceHandleKind, + _, + vfio_assigned_device_resources::VfioCdevDeviceHandle, + _, + >(cdev_resolver); + + (Some(handle), Some(cdev_handle)) }; // Resolve PCIe devices concurrently. @@ -2545,6 +2565,8 @@ impl InitializedVm { vmgs_client_inspect_handle, #[cfg(target_os = "linux")] vfio_inspect, + #[cfg(target_os = "linux")] + vfio_cdev_inspect, halt_recv, client_notify_send, automatic_guest_reset: cfg.automatic_guest_reset, @@ -2937,7 +2959,8 @@ impl LoadedVm { .field("resolver", &self.inner.resolver) .field("vmgs", &self.inner.vmgs_client_inspect_handle); #[cfg(target_os = "linux")] - resp.field("vfio", &self.inner.vfio_inspect); + resp.field("vfio", &self.inner.vfio_inspect) + .field("vfio_cdev", &self.inner.vfio_cdev_inspect); }), }, Event::VmRpc(Err(_)) => break, diff --git a/openvmm/openvmm_entry/src/cli_args.rs b/openvmm/openvmm_entry/src/cli_args.rs index b9889e8ad5..1a133976d5 100644 --- a/openvmm/openvmm_entry/src/cli_args.rs +++ b/openvmm/openvmm_entry/src/cli_args.rs @@ -904,18 +904,35 @@ Assign a host PCI device to the guest via Linux VFIO. The device must be bound to vfio-pci on the host before starting the VM. Examples: - # Assign NVMe controller to root port rp0 - --vfio rp0:0000:01:00.0 + --vfio host=0000:01:00.0,port=rp0 + --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 + +Keys: + host= (required) PCI address on the host + port= (required) Root port or downstream switch port name + iommu= (optional) Reference to an --iommu object. When present, + uses VFIO cdev + iommufd instead of the legacy group path. 
+"#)] + #[cfg(target_os = "linux")] + #[clap(long, conflicts_with("pcat"))] + pub vfio: Vec, -Syntax: : + /// Create an iommufd context for VFIO cdev device assignment + #[clap(long_help = r#" +Declare an iommufd context. Opens /dev/iommu so it can be referenced by +--vfio devices via the iommu= key. The associated IOAS is allocated +the first time a --vfio device referring to this id is opened. + +Requires Linux kernel >= 6.6 with iommufd support. + +Examples: + --iommu id=iommu0 --vfio host=0000:01:00.0,port=rp0,iommu=iommu0 - port_name Root port or downstream switch port name - pci_bdf PCI domain:bus:device.function of the VFIO device on - the host (use lspci -D to find it) +Syntax: id= "#)] #[cfg(target_os = "linux")] #[clap(long, conflicts_with("pcat"))] - pub vfio: Vec, + pub iommu: Vec, } impl Options { @@ -2427,6 +2444,8 @@ impl FromStr for PcieRemoteCli { } /// CLI configuration for a VFIO-assigned PCI device. +/// +/// Syntax: `host=,port=[,iommu=]` #[cfg(target_os = "linux")] #[derive(Clone, Debug)] pub struct VfioDeviceCli { @@ -2434,6 +2453,9 @@ pub struct VfioDeviceCli { pub port_name: String, /// PCI BDF address of the device on the host (e.g., "0000:01:00.0"). pub pci_id: String, + /// Optional iommufd context ID. When set, uses VFIO cdev + iommufd + /// instead of the legacy group/container path. 
+ pub iommu: Option, } #[cfg(target_os = "linux")] @@ -2441,17 +2463,42 @@ impl FromStr for VfioDeviceCli { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - let (port_name, pci_id) = s - .split_once(':') - .context("expected : (e.g., rp0:0000:01:00.0)")?; + let mut host: Option = None; + let mut port: Option = None; + let mut iommu: Option = None; - if port_name.is_empty() { - anyhow::bail!("port name cannot be empty"); + for kv in s.split(',') { + let (key, value) = kv + .split_once('=') + .context("expected key=value pair (e.g., host=0000:01:00.0,port=rp0)")?; + if value.is_empty() { + anyhow::bail!("--vfio: '{key}=' value cannot be empty"); + } + match key { + "host" => { + if host.is_some() { + anyhow::bail!("duplicate --vfio key: 'host'"); + } + host = Some(value.to_string()); + } + "port" => { + if port.is_some() { + anyhow::bail!("duplicate --vfio key: 'port'"); + } + port = Some(value.to_string()); + } + "iommu" => { + if iommu.is_some() { + anyhow::bail!("duplicate --vfio key: 'iommu'"); + } + iommu = Some(value.to_string()); + } + _ => anyhow::bail!("unknown --vfio key: '{key}'"), + } } - if pci_id.is_empty() { - anyhow::bail!("PCI address cannot be empty"); - } + let pci_id = host.context("--vfio: 'host=' is required")?; + let port_name = port.context("--vfio: 'port=' is required")?; // Reject path separators to prevent sysfs path traversal via Path::join. if pci_id.contains('/') || pci_id.contains("..") { @@ -2459,8 +2506,39 @@ impl FromStr for VfioDeviceCli { } Ok(VfioDeviceCli { - port_name: port_name.to_string(), - pci_id: pci_id.to_string(), + port_name, + pci_id, + iommu, + }) + } +} + +/// CLI configuration for an iommufd context. +/// +/// Syntax: `id=` +#[cfg(target_os = "linux")] +#[derive(Clone, Debug)] +pub struct IommuCli { + /// Unique identifier for this iommufd context. 
+ pub id: String, +} + +#[cfg(target_os = "linux")] +impl FromStr for IommuCli { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let (key, value) = s + .split_once('=') + .context("expected id= (e.g., id=iommu0)")?; + if key != "id" { + anyhow::bail!("expected 'id=', got '{key}=...'"); + } + if value.is_empty() { + anyhow::bail!("iommu id cannot be empty"); + } + Ok(IommuCli { + id: value.to_string(), }) } } @@ -3834,4 +3912,65 @@ mod tests { let opt = Options::try_parse_from(["openvmm", "--pidfile", "/tmp/test.pid"]).unwrap(); assert_eq!(opt.pidfile, Some(PathBuf::from("/tmp/test.pid"))); } + + #[cfg(target_os = "linux")] + #[test] + fn test_vfio_device_cli_parse() { + // Required keys only. + let v = VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0").unwrap(); + assert_eq!(v.pci_id, "0000:01:00.0"); + assert_eq!(v.port_name, "rp0"); + assert_eq!(v.iommu, None); + + // With optional iommu= key. Keys may appear in any order. + let v = VfioDeviceCli::from_str("port=rp1,iommu=iommu0,host=0000:02:00.0").unwrap(); + assert_eq!(v.pci_id, "0000:02:00.0"); + assert_eq!(v.port_name, "rp1"); + assert_eq!(v.iommu.as_deref(), Some("iommu0")); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_vfio_device_cli_errors() { + // Missing required keys. + assert!(VfioDeviceCli::from_str("port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0").is_err()); + + // Unknown key. + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,foo=bar").is_err()); + + // Duplicate keys are rejected. + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,host=0000:02:00.0,port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,port=rp1").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,iommu=a,iommu=b").is_err()); + + // Empty values are rejected. 
+ assert!(VfioDeviceCli::from_str("host=,port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,iommu=").is_err()); + + // Missing '=' separator. + assert!(VfioDeviceCli::from_str("host").is_err()); + assert!(VfioDeviceCli::from_str("host=0000:01:00.0,port=rp0,iommu").is_err()); + + // Path-traversal characters in the host BDF are rejected. + assert!(VfioDeviceCli::from_str("host=../../etc/passwd,port=rp0").is_err()); + assert!(VfioDeviceCli::from_str("host=foo/bar,port=rp0").is_err()); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_iommu_cli_parse() { + let c = IommuCli::from_str("id=iommu0").unwrap(); + assert_eq!(c.id, "iommu0"); + + // Wrong key. + assert!(IommuCli::from_str("name=iommu0").is_err()); + + // Missing '=' separator. + assert!(IommuCli::from_str("iommu0").is_err()); + + // Empty id. + assert!(IommuCli::from_str("id=").is_err()); + } } diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index 6afbc14913..abeccd2fa8 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -757,37 +757,108 @@ async fn vm_config_from_command_line( let pcie_switches = build_switch_list(&opt.pcie_switch); #[cfg(target_os = "linux")] - let vfio_pcie_devices: Vec = opt - .vfio - .iter() - .map(|cli_cfg| { - use vm_resource::IntoResource; - - let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id); - let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group")) - .with_context(|| format!("failed to read IOMMU group for {}", cli_cfg.pci_id))?; - let group_id: u64 = iommu_group_link - .file_name() - .and_then(|s| s.to_str()) - .context("invalid iommu_group symlink")? 
- .parse() - .context("failed to parse IOMMU group ID")?; - let group = std::fs::OpenOptions::new() + let vfio_pcie_devices: Vec = { + use std::collections::HashMap; + use vm_resource::IntoResource; + + // Process --iommu flags: open /dev/iommu for each declared context. + let mut iommu_map: HashMap = HashMap::new(); + for iommu_cli in &opt.iommu { + anyhow::ensure!( + !iommu_map.contains_key(&iommu_cli.id), + "duplicate --iommu id={}", + iommu_cli.id + ); + let file = std::fs::OpenOptions::new() .read(true) .write(true) - .open(format!("/dev/vfio/{group_id}")) - .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?; - - Ok(PcieDeviceConfig { - port_name: cli_cfg.port_name.clone(), - resource: vfio_assigned_device_resources::VfioDeviceHandle { - pci_id: cli_cfg.pci_id.clone(), - group, + .open("/dev/iommu") + .context("failed to open /dev/iommu (is iommufd available?)")?; + iommu_map.insert(iommu_cli.id.clone(), file); + } + + opt.vfio + .iter() + .map(|cli_cfg| { + let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id); + + if let Some(iommu_id) = &cli_cfg.iommu { + // cdev + iommufd path + let iommufd = iommu_map.get(iommu_id).with_context(|| { + format!( + "--vfio device {} references iommu={iommu_id}, \ + but no --iommu id={iommu_id} was specified", + cli_cfg.pci_id + ) + })?; + // Clone the iommufd fd so the per-iommu manager can own it. + // The first device for a given iommu ID uses the cloned fd + // to create the IoasManager; subsequent devices reuse the + // existing manager and the cloned fd is dropped. + let iommufd = iommufd.try_clone().with_context(|| { + format!("failed to dup iommufd fd for iommu={iommu_id}") + })?; + + // Open the cdev device node. + let vfio_dev_dir = sysfs_path.join("vfio-dev"); + let entry = std::fs::read_dir(&vfio_dev_dir) + .with_context(|| { + format!( + "failed to read {}: is {} bound to vfio-pci?", + vfio_dev_dir.display(), + cli_cfg.pci_id + ) + })? 
+ .next() + .context("no vfio-dev entry found")? + .context("failed to read vfio-dev entry")?; + let dev_path = Path::new("/dev/vfio/devices").join(entry.file_name()); + let cdev = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(&dev_path) + .with_context(|| format!("failed to open {}", dev_path.display()))?; + + Ok(PcieDeviceConfig { + port_name: cli_cfg.port_name.clone(), + resource: vfio_assigned_device_resources::VfioCdevDeviceHandle { + pci_id: cli_cfg.pci_id.clone(), + cdev, + iommufd, + iommu_id: iommu_id.clone(), + } + .into_resource(), + }) + } else { + // Legacy group/container path + let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group")) + .with_context(|| { + format!("failed to read IOMMU group for {}", cli_cfg.pci_id) + })?; + let group_id: u64 = iommu_group_link + .file_name() + .and_then(|s| s.to_str()) + .context("invalid iommu_group symlink")? + .parse() + .context("failed to parse IOMMU group ID")?; + let group = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(format!("/dev/vfio/{group_id}")) + .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?; + + Ok(PcieDeviceConfig { + port_name: cli_cfg.port_name.clone(), + resource: vfio_assigned_device_resources::VfioDeviceHandle { + pci_id: cli_cfg.pci_id.clone(), + group, + } + .into_resource(), + }) } - .into_resource(), }) - }) - .collect::>>()?; + .collect::>>()? + }; #[cfg(windows)] let vpci_resources: Vec<_> = opt diff --git a/vm/devices/pci/vfio_assigned_device/src/lib.rs b/vm/devices/pci/vfio_assigned_device/src/lib.rs index 6d611c4188..2b86c0dba3 100644 --- a/vm/devices/pci/vfio_assigned_device/src/lib.rs +++ b/vm/devices/pci/vfio_assigned_device/src/lib.rs @@ -194,9 +194,9 @@ pub(crate) struct VfioAssignedPciDevice { )] config_patches: BTreeMap, - /// VFIO container/group binding. Keeps the container and group fds alive - /// and notifies the container manager on drop. - binding: manager::VfioDeviceBinding, + /// VFIO binding. 
Keeps the container/group (legacy) or iommufd/IOAS + /// (cdev) fds alive and cleans up on drop. + binding: manager::VfioBinding, } impl VfioAssignedPciDevice { @@ -231,6 +231,45 @@ impl VfioAssignedPciDevice { .await .with_context(|| format!("failed to open VFIO device {pci_id}"))?; + Self::from_device( + vfio_device, + manager::VfioBinding::Group(binding), + pci_id, + register_mmio, + msi_target, + memory_mapper, + ) + .await + } + + /// Create from a pre-opened VFIO device and a cdev binding. + pub async fn from_cdev( + cdev_binding: manager::VfioCdevBinding, + pci_id: String, + register_mmio: &mut (dyn chipset_device::mmio::RegisterMmioIntercept + Send), + msi_target: &MsiTarget, + memory_mapper: &dyn MemoryMapper, + ) -> anyhow::Result { + let (device, binding) = cdev_binding.into_parts(); + Self::from_device( + device, + manager::VfioBinding::Cdev(binding), + pci_id, + register_mmio, + msi_target, + memory_mapper, + ) + .await + } + + async fn from_device( + vfio_device: vfio_sys::Device, + binding: manager::VfioBinding, + pci_id: String, + register_mmio: &mut (dyn chipset_device::mmio::RegisterMmioIntercept + Send), + msi_target: &MsiTarget, + memory_mapper: &dyn MemoryMapper, + ) -> anyhow::Result { let config_info = vfio_device .region_info(vfio_bindings::bindings::vfio::VFIO_PCI_CONFIG_REGION_INDEX) .context("failed to get VFIO config region info")?; diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 1385324fc5..41b5c176c7 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -15,8 +15,10 @@ use inspect::{Inspect, InspectMut}; use membacking::DmaMapperClient; use mesh::rpc::FailableRpc; use mesh::rpc::RpcSend as _; +use pal_async::task::Spawn as _; use std::collections::HashMap; use std::fs::File; +use std::os::unix::prelude::*; use std::sync::Arc; /// Implements [`membacking::DmaTarget`] for VFIO type1 IOMMU 
containers. @@ -143,17 +145,12 @@ pub(crate) struct VfioContainerManager { /// /// Inspecting this sends a deferred inspect request to the container manager /// task, which reports the container/group/device topology. -#[derive(Clone)] +#[derive(Clone, Inspect)] pub struct VfioManagerClient { + #[inspect(flatten, send = "VfioManagerRpc::Inspect")] sender: mesh::Sender, } -impl Inspect for VfioManagerClient { - fn inspect(&self, req: inspect::Request<'_>) { - self.sender.send(VfioManagerRpc::Inspect(req.defer())); - } -} - impl VfioManagerClient { pub(crate) async fn prepare_device( &self, @@ -456,3 +453,481 @@ impl VfioContainerManager { } } } + +// --- iommufd / cdev support --- + +/// Implements [`membacking::DmaTarget`] for iommufd IOAS-based DMA mapping. +/// +/// Like `VfioType1DmaTarget`, this uses host virtual addresses for mapping, +/// but calls `IOMMU_IOAS_MAP`/`IOMMU_IOAS_UNMAP` on the iommufd fd instead +/// of `VFIO_IOMMU_MAP_DMA`/`VFIO_IOMMU_UNMAP_DMA` on a VFIO container fd. +struct IommufdDmaTarget { + ctx: Arc, + ioas_id: u32, +} + +impl membacking::DmaTarget for IommufdDmaTarget { + unsafe fn map_dma( + &self, + range: memory_range::MemoryRange, + host_va: Option<*const u8>, + _mappable: &membacking::Mappable, + _file_offset: u64, + ) -> anyhow::Result<()> { + let vaddr = + host_va.expect("iommufd IOAS map requires host VA (registered with needs_va=true)"); + let iova = range.start(); + let user_va = vaddr as u64; + let length = range.len(); + // SAFETY: The caller (DmaMapper in membacking) guarantees that the + // host VA is backed and stable via ensure_mapped + VaMapper lifetime. 
+ unsafe { + self.ctx + .ioas_map(self.ioas_id, iova, user_va, length) + .with_context(|| { + format!( + "iommufd IOAS DMA map failed: iova={iova:#x} user_va={user_va:#x} \ + length={length:#x} ioas_id={}", + self.ioas_id + ) + }) + } + } + + fn unmap_dma(&self, range: memory_range::MemoryRange) -> anyhow::Result<()> { + let _span = tracing::info_span!("iommufd unmap", %range).entered(); + self.ctx + .ioas_unmap(self.ioas_id, range.start(), range.len()) + .context("iommufd IOAS DMA unmap failed")?; + Ok(()) + } +} + +// --- Per-iommu-context manager (IoasManager) --- + +/// RPC messages for a per-iommu [`IoasManager`] task. +pub(crate) enum IoasManagerRpc { + /// Bind and attach a cdev device to this manager's IOAS. + PrepareDevice { + pci_id: String, + cdev: File, + /// The response half of the original RPC from the resolver. + respond: FailableRpc<(), CdevPrepareResponse>, + }, + /// Notify that a device has been dropped. + RemoveDevice(u64), + /// Inspect. + Inspect(inspect::Deferred), +} + +/// Manages a single iommufd IOAS context for one `--iommu` instance. +/// +/// Each `--iommu id=` gets its own `IoasManager` task, which owns +/// the iommufd context, IOAS, and DMA mapper registration. Devices +/// referencing the same `--iommu` ID share one IOAS — one set of IOMMU +/// page tables, one DMA mapper registration. Devices on different +/// `--iommu` IDs are handled by separate `IoasManager` tasks concurrently. +#[derive(Inspect)] +struct IoasManager { + iommu_id: String, + #[inspect(skip)] + ctx: Arc, + ioas_id: u32, + /// Keeps the DMA mapper registered with the region manager. + #[inspect(skip)] + _dma_handle: membacking::DmaMapperHandle, + /// Active devices on this IOAS. + #[inspect(with = "|x| inspect::iter_by_key(x.iter().map(|d| (&d.pci_id, ())))")] + devices: Vec, + /// Next device ID (unique within this manager). 
+ #[inspect(skip)] + next_device_id: u64, + #[inspect(skip)] + recv: mesh::Receiver, +} + +/// Tracks a cdev device for inspect and cleanup. +struct CdevDeviceEntry { + id: u64, + pci_id: String, +} + +impl IoasManager { + /// Create and initialize a new per-iommu manager. + /// + /// Allocates an IOAS on the given iommufd fd and registers it with + /// the region manager for DMA mapping. + async fn new( + iommu_id: String, + iommufd: File, + dma_mapper_client: &DmaMapperClient, + recv: mesh::Receiver, + ) -> anyhow::Result { + let ctx = Arc::new(vfio_sys::iommufd::IommufdCtx::from_file(iommufd)); + let ioas_id = ctx + .ioas_alloc() + .context("failed to allocate iommufd IOAS")?; + + let dma_target: Arc = Arc::new(IommufdDmaTarget { + ctx: ctx.clone(), + ioas_id, + }); + let dma_handle = dma_mapper_client + .add_dma_mapper(dma_target, true) + .await + .context("failed to register iommufd IOAS with region manager")?; + + tracing::info!(iommu_id, ioas_id, "created iommufd IOAS for iommu context"); + + Ok(Self { + iommu_id, + ctx, + ioas_id, + _dma_handle: dma_handle, + devices: Vec::new(), + next_device_id: 0, + recv, + }) + } + + /// Run the per-iommu manager task, processing RPCs until the channel + /// closes. + async fn run(mut self) { + while let Ok(rpc) = self.recv.recv().await { + match rpc { + IoasManagerRpc::PrepareDevice { + pci_id, + cdev, + respond, + } => { + respond + .handle_failable(async |()| self.prepare_device(pci_id, cdev)) + .await + } + IoasManagerRpc::RemoveDevice(device_id) => { + self.remove_device(device_id); + } + IoasManagerRpc::Inspect(deferred) => deferred.inspect(&self), + } + } + } + + fn prepare_device( + &mut self, + pci_id: String, + cdev_file: File, + ) -> anyhow::Result { + let cdev = vfio_sys::cdev::CdevDevice::from_file(cdev_file); + + // Bind the cdev device to this iommu context's iommufd. 
+ let devid = cdev + .bind_iommufd(self.ctx.as_raw_fd()) + .context("failed to bind VFIO cdev to iommufd")?; + + // Attach the device to the shared IOAS. + cdev.attach_ioas(self.ioas_id) + .context("failed to attach cdev device to IOAS")?; + + let device_id = self.next_device_id; + self.next_device_id += 1; + + self.devices.push(CdevDeviceEntry { + id: device_id, + pci_id: pci_id.clone(), + }); + + tracing::info!( + pci_id, + iommu_id = self.iommu_id, + iommufd_devid = devid, + ioas_id = self.ioas_id, + device_id, + "VFIO cdev device attached to IOAS" + ); + + Ok(CdevPrepareResponse { + device: cdev.into_device(), + iommufd_devid: devid, + ioas_id: self.ioas_id, + device_id, + manager_send: self.recv.sender(), + }) + } + + fn remove_device(&mut self, device_id: u64) { + if let Some(pos) = self.devices.iter().position(|d| d.id == device_id) { + let entry = self.devices.swap_remove(pos); + tracing::info!( + device_id, + pci_id = entry.pci_id, + iommu_id = self.iommu_id, + "removing cdev device" + ); + } + } +} + +// --- Cdev dispatcher (VfioCdevManager) --- + +/// RPC messages for the cdev dispatcher. +pub(crate) enum VfioCdevManagerRpc { + /// Bind a cdev device to an IOAS, spawning a per-iommu manager if + /// this is the first device for the given iommu ID. + PrepareDevice(FailableRpc), + /// Inspect. + Inspect(inspect::Deferred), +} + +/// Request payload for `PrepareDevice`. +pub(crate) struct CdevPrepareRequest { + pub pci_id: String, + pub cdev: File, + pub iommufd: File, + pub iommu_id: String, +} + +/// Response payload for `PrepareDevice`. +pub(crate) struct CdevPrepareResponse { + pub device: vfio_sys::Device, + pub iommufd_devid: u32, + pub ioas_id: u32, + pub device_id: u64, + /// Sender to the per-iommu manager for drop notification. + pub manager_send: mesh::Sender, +} + +/// Dispatches cdev device requests to per-iommu [`IoasManager`] tasks. 
+/// +/// Unlike the legacy [`VfioContainerManager`] which makes cross-device +/// sharing decisions, the cdev dispatcher simply routes each device to +/// the manager for its `--iommu` ID. Each per-iommu manager runs as a +/// separate task, so devices on different `--iommu` contexts are +/// prepared concurrently. +pub(crate) struct VfioCdevManager { + /// Per-iommu manager senders, keyed by `--iommu` ID. + managers: HashMap>, + /// DMA mapper client, cloned for each new per-iommu manager. + dma_mapper_client: DmaMapperClient, + /// Spawner for per-iommu manager tasks. + spawner: Arc, + /// Per-iommu manager tasks (kept alive). + tasks: Vec>, + recv: mesh::Receiver, +} + +/// Client handle for the `VfioCdevManager` dispatcher. +#[derive(Clone, Inspect)] +pub struct VfioCdevManagerClient { + #[inspect(flatten, send = "VfioCdevManagerRpc::Inspect")] + sender: mesh::Sender, +} + +impl VfioCdevManagerClient { + pub(crate) async fn prepare_device( + &self, + req: CdevPrepareRequest, + ) -> anyhow::Result { + Ok(self + .sender + .call_failable(VfioCdevManagerRpc::PrepareDevice, req) + .await?) + } +} + +impl VfioCdevManager { + /// Create a new cdev dispatcher. + pub fn new( + spawner: Arc, + dma_mapper_client: DmaMapperClient, + ) -> Self { + Self { + managers: HashMap::new(), + dma_mapper_client, + spawner, + tasks: Vec::new(), + recv: mesh::Receiver::new(), + } + } + + /// Run the dispatcher, routing device requests to per-iommu managers. 
+ pub async fn run(mut self) { + while let Ok(rpc) = self.recv.recv().await { + match rpc { + VfioCdevManagerRpc::PrepareDevice(rpc) => { + let (req, respond) = rpc.split(); + self.route_prepare(req, respond).await; + } + VfioCdevManagerRpc::Inspect(deferred) => { + deferred.respond(|resp| { + for (iommu_id, sender) in &self.managers { + resp.child(iommu_id, |req| { + sender.send(IoasManagerRpc::Inspect(req.defer())); + }); + } + }); + } + } + } + } + + /// Route a prepare request to the per-iommu manager, spawning one + /// if needed. Initializes the per-iommu manager inline on first use + /// so that init failures are reported directly to the caller. + /// + /// The actual bind/attach ioctls are forwarded to the per-iommu + /// manager task via fire-and-forget send, so the dispatcher is + /// immediately free to handle the next request. This allows devices + /// on different `--iommu` contexts to be prepared concurrently. + async fn route_prepare( + &mut self, + req: CdevPrepareRequest, + respond: FailableRpc<(), CdevPrepareResponse>, + ) { + let CdevPrepareRequest { + pci_id, + cdev, + iommufd, + iommu_id, + } = req; + + let sender = match self.managers.entry(iommu_id.clone()) { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + let mut ioas_recv: mesh::Receiver = mesh::Receiver::new(); + let sender = ioas_recv.sender(); + + let mgr = match IoasManager::new( + iommu_id.clone(), + iommufd, + &self.dma_mapper_client, + ioas_recv, + ) + .await + .with_context(|| { + format!("failed to initialize iommufd IOAS manager for iommu={iommu_id}") + }) { + Ok(mgr) => mgr, + Err(e) => { + respond.fail(e); + return; + } + }; + + let task = self + .spawner + .spawn(format!("vfio-ioas-{iommu_id}"), mgr.run()); + self.tasks.push(task); + e.insert(sender) + } + }; + + // Forward to the per-iommu manager task. The manager will + // complete the respond half after the bind/attach ioctls. 
+ sender.send(IoasManagerRpc::PrepareDevice { + pci_id, + cdev, + respond, + }); + } + + pub(crate) fn client(&mut self) -> VfioCdevManagerClient { + VfioCdevManagerClient { + sender: self.recv.sender(), + } + } +} + +/// Binding for a VFIO device opened via the cdev + iommufd path. +/// +/// Analogous to [`VfioDeviceBinding`] for the legacy group path. +/// Notifies the per-iommu manager on drop so device counts stay accurate. +#[derive(Inspect)] +pub(crate) struct VfioCdevBinding { + /// PCI BDF address on the host. + pci_id: String, + /// VFIO cdev device — provides config space, BAR, IRQ ioctls. + #[inspect(skip)] + device: vfio_sys::Device, + /// iommufd device ID (from `VFIO_DEVICE_BIND_IOMMUFD`). + iommufd_devid: u32, + /// IOAS ID this device is attached to. + ioas_id: u32, + /// Device ID assigned by the per-iommu manager (for drop notification). + #[inspect(skip)] + device_id: u64, + /// Sender to the per-iommu manager for drop notification. + #[inspect(skip)] + manager_send: mesh::Sender, +} + +impl VfioCdevBinding { + /// Create from a dispatcher response. + pub(crate) fn from_response(resp: CdevPrepareResponse, pci_id: String) -> Self { + Self { + pci_id, + device: resp.device, + iommufd_devid: resp.iommufd_devid, + ioas_id: resp.ioas_id, + device_id: resp.device_id, + manager_send: resp.manager_send, + } + } + + /// Consume the binding and split into the `Device` (for constructing + /// `VfioAssignedPciDevice`) and the remaining binding state (for + /// lifetime management). The state's `Drop` impl notifies the per-iommu + /// manager when the device is released. 
+ pub fn into_parts(self) -> (vfio_sys::Device, VfioCdevBindingState) { + let Self { + pci_id, + device, + iommufd_devid, + ioas_id, + device_id, + manager_send, + } = self; + ( + device, + VfioCdevBindingState { + pci_id, + iommufd_devid, + ioas_id, + device_id, + manager_send, + }, + ) + } +} + +/// The iommufd-related state from a [`VfioCdevBinding`], kept alive for +/// the lifetime of the assigned device. +/// +/// Notifies the per-iommu manager on drop so device counts are accurate. +#[derive(Inspect)] +pub(crate) struct VfioCdevBindingState { + pci_id: String, + iommufd_devid: u32, + ioas_id: u32, + #[inspect(skip)] + device_id: u64, + #[inspect(skip)] + manager_send: mesh::Sender, +} + +impl Drop for VfioCdevBindingState { + fn drop(&mut self) { + self.manager_send + .send(IoasManagerRpc::RemoveDevice(self.device_id)); + } +} + +/// Wrapper enum for either legacy group or cdev iommufd binding. +/// +/// Kept as a field on `VfioAssignedPciDevice` to hold the underlying +/// fd/handle resources alive for the device's lifetime. 
+#[derive(Inspect)] +#[inspect(external_tag)] +pub(crate) enum VfioBinding { + Group(VfioDeviceBinding), + Cdev(VfioCdevBindingState), +} diff --git a/vm/devices/pci/vfio_assigned_device/src/resolver.rs b/vm/devices/pci/vfio_assigned_device/src/resolver.rs index 089744e51e..3310bac5ef 100644 --- a/vm/devices/pci/vfio_assigned_device/src/resolver.rs +++ b/vm/devices/pci/vfio_assigned_device/src/resolver.rs @@ -9,8 +9,11 @@ use crate::manager::VfioManagerClient; use anyhow::Context as _; use async_trait::async_trait; use membacking::DmaMapperClient; +use pal_async::task::Spawn as _; use pci_resources::ResolvePciDeviceHandleParams; use pci_resources::ResolvedPciDevice; +use std::sync::Arc; +use vfio_assigned_device_resources::VfioCdevDeviceHandle; use vfio_assigned_device_resources::VfioDeviceHandle; use vm_resource::AsyncResolveResource; use vm_resource::ResourceResolver; @@ -87,3 +90,85 @@ impl AsyncResolveResource for VfioDeviceR Ok(device.into()) } } + +/// Resource resolver for [`VfioCdevDeviceHandle`] (cdev + iommufd path). +/// +/// Spawns a `VfioCdevManager` task internally and communicates with it via RPC +/// to share IOAS contexts across devices referencing the same iommu ID. +pub struct VfioCdevDeviceResolver { + client: crate::manager::VfioCdevManagerClient, + _task: pal_async::task::Task<()>, +} + +impl VfioCdevDeviceResolver { + /// Create a new cdev resolver, spawning the cdev dispatcher task. + pub fn new( + spawner: impl pal_async::task::Spawn + 'static, + dma_mapper_client: DmaMapperClient, + ) -> Self { + // Arc the spawner so the dispatcher can spawn per-iommu manager tasks. + let spawner: Arc = Arc::new(spawner); + let mut manager = crate::manager::VfioCdevManager::new(spawner.clone(), dma_mapper_client); + let client = manager.client(); + let task = spawner.spawn("vfio-cdev-dispatch", manager.run()); + Self { + client, + _task: task, + } + } + + /// Returns a handle for the VM's inspect tree. 
+ pub fn inspect_handle(&self) -> crate::manager::VfioCdevManagerClient { + self.client.clone() + } +} + +#[async_trait] +impl AsyncResolveResource for VfioCdevDeviceResolver { + type Output = ResolvedPciDevice; + type Error = anyhow::Error; + + async fn resolve( + &self, + _resolver: &ResourceResolver, + resource: VfioCdevDeviceHandle, + input: ResolvePciDeviceHandleParams<'_>, + ) -> Result { + let VfioCdevDeviceHandle { + pci_id, + cdev, + iommufd, + iommu_id, + } = resource; + + tracing::info!(pci_id, iommu_id, "opening VFIO cdev device with iommufd"); + + let resp = self + .client + .prepare_device(crate::manager::CdevPrepareRequest { + pci_id: pci_id.clone(), + cdev, + iommufd, + iommu_id, + }) + .await + .context("VFIO cdev manager failed")?; + + let cdev_binding = crate::manager::VfioCdevBinding::from_response(resp, pci_id.clone()); + + let memory_mapper = input + .shared_mem_mapper + .context("memory mapper is required for VFIO device assignment")?; + + let device = VfioAssignedPciDevice::from_cdev( + cdev_binding, + pci_id, + input.register_mmio, + input.msi_target, + memory_mapper, + ) + .await?; + + Ok(device.into()) + } +} diff --git a/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs b/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs index e0b6875a20..744e4818fe 100644 --- a/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs +++ b/vm/devices/pci/vfio_assigned_device_resources/src/lib.rs @@ -10,7 +10,7 @@ use std::fs::File; use vm_resource::ResourceId; use vm_resource::kind::PciDeviceHandleKind; -/// A handle to a VFIO-assigned PCI device. +/// A handle to a VFIO-assigned PCI device (legacy group path). 
/// /// The launcher opens the VFIO group file descriptor (e.g., `/dev/vfio/N`) /// and passes it here so that the VMM process does not need direct access @@ -26,3 +26,26 @@ pub struct VfioDeviceHandle { impl ResourceId for VfioDeviceHandle { const ID: &'static str = "vfio"; } + +/// A handle to a VFIO-assigned PCI device (cdev + iommufd path). +/// +/// The launcher opens the VFIO cdev file descriptor +/// (e.g., `/dev/vfio/devices/vfio0`) and the iommufd file descriptor +/// (`/dev/iommu`) and passes them here. The VMM binds the device to the +/// iommufd instance and attaches an IOAS for DMA mapping. +#[derive(MeshPayload)] +pub struct VfioCdevDeviceHandle { + /// PCI BDF address on the host (e.g., "0000:3f:7a.0"). + pub pci_id: String, + /// Pre-opened VFIO cdev file descriptor (`/dev/vfio/devices/vfioN`). + pub cdev: File, + /// Pre-opened iommufd file descriptor (`/dev/iommu`). + pub iommufd: File, + /// The `--iommu` context ID this device belongs to. All devices + /// sharing the same ID share a single IOAS (one set of page tables). + pub iommu_id: String, +} + +impl ResourceId for VfioCdevDeviceHandle { + const ID: &'static str = "vfio-cdev"; +} diff --git a/vm/devices/user_driver/vfio_sys/src/cdev.rs b/vm/devices/user_driver/vfio_sys/src/cdev.rs new file mode 100644 index 0000000000..c8e8987199 --- /dev/null +++ b/vm/devices/user_driver/vfio_sys/src/cdev.rs @@ -0,0 +1,167 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VFIO cdev (per-device fd) support. +//! +//! VFIO cdev is the modern device-access interface (`/dev/vfio/devices/vfioN`) +//! that replaces the legacy group/container model. Each device gets its own +//! character device node. The device is bound to an iommufd instance via +//! `VFIO_DEVICE_BIND_IOMMUFD`, and DMA is configured by attaching an iommufd +//! IOAS or HWPT via `VFIO_DEVICE_ATTACH_IOMMUFD_PT`. +//! +//! Once bound and attached, the device fd supports the same `VFIO_DEVICE_*` +//! 
ioctls as the legacy group path (get_info, get_region_info, set_irqs, +//! reset, mmap). The [`CdevDevice`] type wraps the fd and provides these +//! operations, producing a [`super::Device`] for the common ioctl surface. + +use anyhow::Context as _; +use std::fs; +use std::os::unix::prelude::*; + +mod ioctl { + use nix::request_code_none; + use vfio_bindings::bindings::vfio::VFIO_BASE; + use vfio_bindings::bindings::vfio::VFIO_TYPE; + + // VFIO_DEVICE_BIND_IOMMUFD = _IO(VFIO_TYPE, VFIO_BASE + 18) + nix::ioctl_readwrite_bad!( + vfio_device_bind_iommufd, + request_code_none!(VFIO_TYPE, VFIO_BASE + 18), + super::VfioDeviceBindIommufd + ); + + // VFIO_DEVICE_ATTACH_IOMMUFD_PT = _IO(VFIO_TYPE, VFIO_BASE + 19) + nix::ioctl_readwrite_bad!( + vfio_device_attach_iommufd_pt, + request_code_none!(VFIO_TYPE, VFIO_BASE + 19), + super::VfioDeviceAttachIommufdPt + ); + + // VFIO_DEVICE_DETACH_IOMMUFD_PT = _IO(VFIO_TYPE, VFIO_BASE + 20) + nix::ioctl_readwrite_bad!( + vfio_device_detach_iommufd_pt, + request_code_none!(VFIO_TYPE, VFIO_BASE + 20), + super::VfioDeviceDetachIommufdPt + ); +} + +// Kernel ABI structs — must match `include/uapi/linux/vfio.h` exactly. + +#[repr(C)] +pub struct VfioDeviceBindIommufd { + pub argsz: u32, + pub flags: u32, + pub iommufd: i32, + pub out_devid: u32, +} + +#[repr(C)] +pub struct VfioDeviceAttachIommufdPt { + pub argsz: u32, + pub flags: u32, + pub pt_id: u32, +} + +#[repr(C)] +pub struct VfioDeviceDetachIommufdPt { + pub argsz: u32, + pub flags: u32, +} + +/// A VFIO device opened via the cdev interface (`/dev/vfio/devices/vfioN`). +/// +/// This is the modern per-device access path. After opening, the device must +/// be bound to an iommufd fd via [`bind_iommufd`](Self::bind_iommufd) and +/// then attached to an IOAS or HWPT via [`attach_ioas`](Self::attach_ioas) +/// before any DMA can occur. 
+/// +/// Once bound and attached, call [`into_device`](Self::into_device) to get +/// the standard [`Device`](super::Device) for config space, BAR, IRQ, and +/// mmap operations. +pub struct CdevDevice { + file: fs::File, +} + +impl CdevDevice { + /// Wrap a pre-opened VFIO cdev file descriptor. + pub fn from_file(file: fs::File) -> Self { + Self { file } + } + + /// Bind this device to an iommufd instance. + /// + /// Returns the kernel-assigned device ID within the iommufd context. + /// This must be called before any DMA operations. + pub fn bind_iommufd(&self, iommufd_fd: RawFd) -> anyhow::Result { + let mut cmd = VfioDeviceBindIommufd { + argsz: size_of::() as u32, + flags: 0, + iommufd: iommufd_fd, + out_devid: 0, + }; + // SAFETY: Both fds are valid, struct correctly constructed. + unsafe { + ioctl::vfio_device_bind_iommufd(self.file.as_raw_fd(), &mut cmd) + .context("VFIO_DEVICE_BIND_IOMMUFD failed")?; + } + Ok(cmd.out_devid) + } + + /// Attach the device to an IOAS or HWPT by its iommufd object ID. + /// + /// Pass an IOAS ID for identity DMA translation, or a HWPT ID for + /// nested translation. + /// + /// Returns the attached page table ID (may differ from input if the + /// kernel auto-created a HWPT for the IOAS). + pub fn attach_ioas(&self, pt_id: u32) -> anyhow::Result { + let mut cmd = VfioDeviceAttachIommufdPt { + argsz: size_of::() as u32, + flags: 0, + pt_id, + }; + // SAFETY: fd is valid, struct correctly constructed. + unsafe { + ioctl::vfio_device_attach_iommufd_pt(self.file.as_raw_fd(), &mut cmd) + .context("VFIO_DEVICE_ATTACH_IOMMUFD_PT failed")?; + } + Ok(cmd.pt_id) + } + + /// Detach the device from its current IOAS/HWPT. + /// + /// After detaching, the device is in a blocking DMA state. + pub fn detach_ioas(&self) -> anyhow::Result<()> { + let mut cmd = VfioDeviceDetachIommufdPt { + argsz: size_of::() as u32, + flags: 0, + }; + // SAFETY: fd is valid, struct correctly constructed. 
+ unsafe { + ioctl::vfio_device_detach_iommufd_pt(self.file.as_raw_fd(), &mut cmd) + .context("VFIO_DEVICE_DETACH_IOMMUFD_PT failed")?; + } + Ok(()) + } + + /// Convert to a standard [`Device`](super::Device) for config space, + /// BAR, IRQ, and mmap operations. + /// + /// The cdev fd supports the same `VFIO_DEVICE_*` ioctls as the legacy + /// group path, so the [`Device`](super::Device) type works unchanged. + pub fn into_device(self) -> super::Device { + super::Device { file: self.file } + } +} + +impl AsRef for CdevDevice { + fn as_ref(&self) -> &fs::File { + &self.file + } +} + +impl AsFd for CdevDevice { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} diff --git a/vm/devices/user_driver/vfio_sys/src/iommufd.rs b/vm/devices/user_driver/vfio_sys/src/iommufd.rs new file mode 100644 index 0000000000..56cc0643f0 --- /dev/null +++ b/vm/devices/user_driver/vfio_sys/src/iommufd.rs @@ -0,0 +1,222 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Bindings for the Linux iommufd subsystem (`/dev/iommu`). +//! +//! Provides safe wrappers around `IOMMU_IOAS_ALLOC`, `IOMMU_IOAS_MAP`, +//! `IOMMU_IOAS_UNMAP`, and `IOMMU_DESTROY` ioctls, which together support +//! identity DMA mapping via an IOAS. + +use anyhow::Context as _; +use std::fs; +use std::os::unix::prelude::*; + +/// iommufd ioctl type character (';' = 0x3B). +const IOMMUFD_TYPE: u8 = b';'; + +/// Base command number for iommufd ioctls. +const IOMMUFD_CMD_BASE: u8 = 0x80; + +// Command numbers (IOMMUFD_CMD_BASE + offset). +const IOMMUFD_CMD_DESTROY: u8 = IOMMUFD_CMD_BASE; +const IOMMUFD_CMD_IOAS_ALLOC: u8 = IOMMUFD_CMD_BASE + 1; +const IOMMUFD_CMD_IOAS_MAP: u8 = IOMMUFD_CMD_BASE + 5; +const IOMMUFD_CMD_IOAS_UNMAP: u8 = IOMMUFD_CMD_BASE + 6; + +/// Flags for `IOMMU_IOAS_MAP`. 
+pub const IOMMU_IOAS_MAP_FIXED_IOVA: u32 = 1 << 0; +pub const IOMMU_IOAS_MAP_WRITEABLE: u32 = 1 << 1; +pub const IOMMU_IOAS_MAP_READABLE: u32 = 1 << 2; + +mod ioctl { + use nix::request_code_none; + + // IOMMUFD ioctls use _IO (no direction, just type + nr). + // The kernel defines them as _IO(IOMMUFD_TYPE, cmd_nr). + nix::ioctl_readwrite_bad!( + iommu_destroy, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_DESTROY as u32 + ), + super::IommuDestroy + ); + nix::ioctl_readwrite_bad!( + iommu_ioas_alloc, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_IOAS_ALLOC as u32 + ), + super::IommuIoasAlloc + ); + nix::ioctl_readwrite_bad!( + iommu_ioas_map, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_IOAS_MAP as u32 + ), + super::IommuIoasMap + ); + nix::ioctl_readwrite_bad!( + iommu_ioas_unmap, + request_code_none!( + super::IOMMUFD_TYPE as u32, + super::IOMMUFD_CMD_IOAS_UNMAP as u32 + ), + super::IommuIoasUnmap + ); +} + +// Kernel ABI structs — must match `include/uapi/linux/iommufd.h` exactly. + +#[repr(C)] +pub struct IommuDestroy { + pub size: u32, + pub id: u32, +} + +#[repr(C)] +pub struct IommuIoasAlloc { + pub size: u32, + pub flags: u32, + pub out_ioas_id: u32, +} + +#[repr(C)] +pub struct IommuIoasMap { + pub size: u32, + pub flags: u32, + pub ioas_id: u32, + pub __reserved: u32, + pub user_va: u64, + pub length: u64, + pub iova: u64, +} + +#[repr(C)] +pub struct IommuIoasUnmap { + pub size: u32, + pub ioas_id: u32, + pub iova: u64, + pub length: u64, +} + +/// An open iommufd file descriptor (`/dev/iommu`). +/// +/// Wraps the fd and provides safe methods for the iommufd ioctls needed +/// to allocate an IOAS and map/unmap host memory into it. +pub struct IommufdCtx { + file: fs::File, +} + +impl IommufdCtx { + /// Open `/dev/iommu` and return a new iommufd context. 
+ pub fn new() -> anyhow::Result { + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .open("/dev/iommu") + .context("failed to open /dev/iommu")?; + Ok(Self { file }) + } + + /// Wrap an existing iommufd file descriptor. + pub fn from_file(file: fs::File) -> Self { + Self { file } + } + + /// Allocate a new IO Address Space (IOAS). + /// + /// Returns the kernel-assigned IOAS object ID. + pub fn ioas_alloc(&self) -> anyhow::Result { + let mut cmd = IommuIoasAlloc { + size: size_of::() as u32, + flags: 0, + out_ioas_id: 0, + }; + // SAFETY: fd is valid, struct is correctly sized and zeroed. + unsafe { + ioctl::iommu_ioas_alloc(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_IOAS_ALLOC failed")?; + } + Ok(cmd.out_ioas_id) + } + + /// Map a user VA range into an IOAS at a fixed IOVA. + /// + /// `ioas_id` is the IOAS to map into. `iova` is the fixed IO virtual + /// address. `user_va` is the host virtual address of the backing memory. + /// `length` is the size in bytes (must be page-aligned). + /// + /// # Safety + /// `user_va` must point to valid, backed memory for `length` bytes. + /// The memory must remain mapped for the lifetime of this IOAS mapping. + pub unsafe fn ioas_map( + &self, + ioas_id: u32, + iova: u64, + user_va: u64, + length: u64, + ) -> anyhow::Result<()> { + let mut cmd = IommuIoasMap { + size: size_of::() as u32, + flags: IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE, + ioas_id, + __reserved: 0, + user_va, + length, + iova, + }; + // SAFETY: fd is valid, struct correctly constructed. Caller + // guarantees user_va is backed and stable. + unsafe { + ioctl::iommu_ioas_map(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_IOAS_MAP failed")?; + } + Ok(()) + } + + /// Unmap an IOVA range from an IOAS. + /// + /// Returns the number of bytes actually unmapped. 
+ pub fn ioas_unmap(&self, ioas_id: u32, iova: u64, length: u64) -> anyhow::Result { + let mut cmd = IommuIoasUnmap { + size: size_of::() as u32, + ioas_id, + iova, + length, + }; + // SAFETY: fd is valid, struct correctly constructed. + unsafe { + ioctl::iommu_ioas_unmap(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_IOAS_UNMAP failed")?; + } + Ok(cmd.length) + } + + /// Destroy an iommufd object by its ID. + pub fn destroy(&self, id: u32) -> anyhow::Result<()> { + let mut cmd = IommuDestroy { + size: size_of::() as u32, + id, + }; + // SAFETY: fd is valid, struct correctly constructed. + unsafe { + ioctl::iommu_destroy(self.file.as_raw_fd(), &mut cmd) + .context("IOMMU_DESTROY failed")?; + } + Ok(()) + } +} + +impl AsFd for IommufdCtx { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} + +impl AsRawFd for IommufdCtx { + fn as_raw_fd(&self) -> RawFd { + self.file.as_raw_fd() + } +} diff --git a/vm/devices/user_driver/vfio_sys/src/lib.rs b/vm/devices/user_driver/vfio_sys/src/lib.rs index 0af03092b3..abe6fc4032 100644 --- a/vm/devices/user_driver/vfio_sys/src/lib.rs +++ b/vm/devices/user_driver/vfio_sys/src/lib.rs @@ -6,6 +6,9 @@ // UNSAFETY: Manual memory management with mmap and vfio ioctls. #![expect(unsafe_code)] +pub mod cdev; +pub mod iommufd; + use anyhow::Context; use bitfield_struct::bitfield; use headervec::HeaderVec; @@ -213,14 +216,6 @@ impl Container { ioctl::vfio_iommu_unmap_dma(self.file.as_raw_fd(), &mut dma_unmap) .context("VFIO_IOMMU_UNMAP_DMA failed")?; } - if dma_unmap.size != size { - tracing::warn!( - iova, - requested = size, - actual = dma_unmap.size, - "VFIO_IOMMU_UNMAP_DMA: unmapped size differs from requested" - ); - } Ok(()) } }