diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b2a76e523d..e06b83af00a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,14 @@ and this project adheres to ### Added +- [#5896](https://github.com/firecracker-microvm/firecracker/pull/5896): Add + support for overriding virtio-pmem device backing file paths on snapshot + restore via the new `pmem_overrides` field on the `PUT /snapshot/load` API. + This mirrors the existing network and vsock override mechanisms and is useful + when the host file path baked into the snapshot is no longer valid (for + example, when restoring on a different host or under a different jailer + chroot). + ### Changed ### Deprecated diff --git a/docs/pmem.md b/docs/pmem.md index 7629d9c1433..324706a5bfa 100644 --- a/docs/pmem.md +++ b/docs/pmem.md @@ -260,6 +260,38 @@ same backing file as it was configured in the first place. This means all `virtio-pmem` backing files should be present in the same locations during restore as they were during initial `virtio-pmem` configuration. +### Overriding pmem backing file paths on restore + +When the host file path baked into the snapshot is no longer valid - for +example, when restoring on a different host or under a different jailer chroot - +the `pmem_overrides` parameter of the snapshot load API can be used to point +each `virtio-pmem` device at the same backing file in its new location. + +This is only intended for relocating the original backing file; it is not a +mechanism for swapping in a different backing file on restore. The overriding +file must therefore be the same backing file (and hence the same size) as the +one used when the snapshot was taken. + +```bash +curl --unix-socket /tmp/firecracker.socket -i \ + -X PUT 'http://localhost/snapshot/load' \ + -H 'Accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "snapshot_path": "./snapshot_file", + "mem_backend": { + "backend_path": "./mem_file", + "backend_type": "File" + }, + "pmem_overrides": [ + { + "id": "pmem0", + "path_on_host": "/new/path/to/pmem0.img" + } + ] + }' +``` + ## Performance Even though `virtio-pmem` allows for the direct access of host pages from the diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index e6640ac3390..3bffac9834e 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -112,6 +112,7 @@ fn parse_put_snapshot_load(body: &Body) -> Result { network_overrides: snapshot_config.network_overrides, vsock_override: snapshot_config.vsock_override, clock_realtime: snapshot_config.clock_realtime, + pmem_overrides: snapshot_config.pmem_overrides, }; // Construct the `ParsedRequest` object. @@ -127,7 +128,9 @@ fn parse_put_snapshot_load(body: &Body) -> Result { #[cfg(test)] mod tests { - use vmm::vmm_config::snapshot::{MemBackendConfig, MemBackendType, NetworkOverride}; + use vmm::vmm_config::snapshot::{ + MemBackendConfig, MemBackendType, NetworkOverride, PmemOverride, + }; use super::*; use crate::api_server::parsed_request::tests::{depr_action_from_req, vmm_action_from_request}; @@ -191,6 +194,7 @@ mod tests { network_overrides: vec![], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -223,6 +227,7 @@ mod tests { network_overrides: vec![], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -255,6 +260,7 @@ mod tests { network_overrides: vec![], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -296,6 +302,49 @@ mod tests { }], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], + }; + let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); + assert!( + parsed_request + .parsing_info() + .take_deprecation_message() + .is_none() + ); + assert_eq!( + vmm_action_from_request(parsed_request), + VmmAction::LoadSnapshot(expected_config) + ); + + let body = r#"{ + "snapshot_path": "foo", + "mem_backend": { + "backend_path": "bar", + "backend_type": "File" + }, + "resume_vm": true, + "pmem_overrides": [ + { + "id": "pmem0", + "path_on_host": "/new/path/pmem0.img" + } + ] + }"#; + let expected_config = LoadSnapshotParams { + snapshot_path: PathBuf::from("foo"), + mem_backend: MemBackendConfig { + backend_path: PathBuf::from("bar"), + backend_type: MemBackendType::File, + }, + track_dirty_pages: false, + resume_vm: true, + network_overrides: vec![], + vsock_override: None, + clock_realtime: false, + pmem_overrides: vec![PmemOverride { + id: String::from("pmem0"), + path_on_host: String::from("/new/path/pmem0.img"), + }], }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -325,6 +374,7 @@ mod tests { network_overrides: vec![], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], }; let parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert_eq!( diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index d1ac91bdb6a..7d8fbf90a78 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1655,6 +1655,26 @@ definitions: description: The new path for the backing Unix Domain Socket. + PmemOverride: + type: object + description: + Allows for changing the backing host file of a virtio-pmem device + during snapshot restore. + required: + - id + - path_on_host + properties: + id: + type: string + description: + The ID of the pmem device to modify. + path_on_host: + type: string + description: + The new host path for the pmem device's backing file. The file must + be at least as large as the backing file used when the snapshot was + taken. + SnapshotLoadParams: type: object description: @@ -1710,6 +1730,11 @@ definitions: elapsed since the snapshot was taken. When false (default), kvmclock resumes from where it was at snapshot time. This option may be extended to other clock sources and CPU architectures in the future." + pmem_overrides: + type: array + description: Pmem device backing file paths to override on snapshot restore. + items: + $ref: "#/definitions/PmemOverride" TokenBucket: diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 9112eef66cb..b7c0d722f1d 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -34,7 +34,9 @@ use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigError, MachineConfigUpdate}; -use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, MemBackendType}; +use crate::vmm_config::snapshot::{ + CreateSnapshotParams, LoadSnapshotParams, MemBackendType, PmemOverride, +}; use crate::vstate::kvm::KvmState; use crate::vstate::memory::{ self, GuestMemoryState, GuestRegionMmap, GuestRegionType, MemoryError, @@ -412,6 +414,8 @@ pub fn restore_from_snapshot( .clone_from(&vsock_override.uds_path); } + apply_pmem_overrides(&mut microvm_state, ¶ms.pmem_overrides)?; + let track_dirty_pages = params.track_dirty_pages; let vcpu_count = microvm_state @@ -475,6 +479,38 @@ pub fn restore_from_snapshot( .map_err(RestoreFromSnapshotError::Build) } +/// Applies the pmem backing file path overrides to the restored microVM state. +/// +/// Each override identifies a pmem device by its id and replaces its +/// `path_on_host`. Both MMIO and PCI transports are searched. An override +/// targeting an id that does not match any pmem device returns an error. +fn apply_pmem_overrides( + microvm_state: &mut MicrovmState, + overrides: &[PmemOverride], +) -> Result<(), SnapshotStateFromFileError> { + for entry in overrides { + microvm_state + .device_states + .mmio_state + .pmem_devices + .iter_mut() + .map(|device| &mut device.device_state) + .chain( + microvm_state + .device_states + .pci_state + .pmem_devices + .iter_mut() + .map(|device| &mut device.device_state), + ) + .find(|state| state.config.id == entry.id) + .map(|state| state.config.path_on_host.clone_from(&entry.path_on_host)) + .ok_or_else(|| SnapshotStateFromFileError::UnknownPmemDevice(entry.id.clone()))?; + } + + Ok(()) +} + /// Error type for [`snapshot_state_from_file`] #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum SnapshotStateFromFileError { @@ -486,6 +522,8 @@ pub enum SnapshotStateFromFileError { UnknownNetworkDevice, /// Unknown Vsock Device. UnknownVsockDevice, + /// Unknown Pmem device: {0} + UnknownPmemDevice(String), } fn snapshot_state_from_file( @@ -657,7 +695,7 @@ mod tests { use crate::builder::tests::insert_vmgenid_device; use crate::builder::tests::{ CustomBlockConfig, default_kernel_cmdline, default_vmm, insert_balloon_device, - insert_block_devices, insert_net_device, insert_vsock_device, + insert_block_devices, insert_net_device, insert_pmem_devices, insert_vsock_device, }; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; @@ -665,6 +703,7 @@ mod tests { use crate::snapshot::Persist; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::pmem::PmemConfig; use crate::vmm_config::vsock::tests::default_config; use crate::vstate::memory::{GuestMemoryRegionState, GuestRegionType}; @@ -717,6 +756,18 @@ mod tests { insert_vsock_device(&mut vmm, &mut cmdline, &mut event_manager, vsock_config); + // Add a pmem device. The returned temp file is dropped immediately; only + // the saved device state (which carries the configured id/path) is needed. + let pmem_config = PmemConfig { + id: String::from("pmem0"), + path_on_host: String::new(), + root_device: false, + read_only: false, + rate_limiter: None, + }; + let _pmem_files = + insert_pmem_devices(&mut vmm, &mut cmdline, &mut event_manager, vec![pmem_config]); + #[cfg(target_arch = "x86_64")] insert_vmgenid_device(&mut vmm); #[cfg(target_arch = "x86_64")] @@ -765,6 +816,97 @@ mod tests { ) } + fn microvm_state_with_devices() -> MicrovmState { + let vmm = default_vmm_with_devices(); + let device_states = vmm.device_manager.save(); + + let vcpu_states = vec![VcpuState::default()]; + #[cfg(target_arch = "aarch64")] + let mpidrs = construct_kvm_mpidrs(&vcpu_states); + MicrovmState { + device_states, + vcpu_states, + kvm_state: Default::default(), + vm_info: VmInfo { + mem_size_mib: 1u64, + ..Default::default() + }, + #[cfg(target_arch = "aarch64")] + vm_state: vmm.vm.as_kvm().unwrap().save_state(&mpidrs).unwrap(), + #[cfg(target_arch = "x86_64")] + vm_state: vmm.vm.as_kvm().unwrap().save_state().unwrap(), + } + } + + fn pmem_path(state: &MicrovmState, id: &str) -> Option { + state + .device_states + .mmio_state + .pmem_devices + .iter() + .map(|device| &device.device_state) + .chain( + state + .device_states + .pci_state + .pmem_devices + .iter() + .map(|device| &device.device_state), + ) + .find(|s| s.config.id == id) + .map(|s| s.config.path_on_host.clone()) + } + + #[test] + fn test_apply_pmem_overrides_happy_path() { + let mut microvm_state = microvm_state_with_devices(); + // Sanity: the device exists and its path differs from what we override to. + assert!(pmem_path(µvm_state, "pmem0").is_some()); + + let overrides = vec![PmemOverride { + id: String::from("pmem0"), + path_on_host: String::from("/new/backing/file"), + }]; + + apply_pmem_overrides(&mut microvm_state, &overrides).unwrap(); + + assert_eq!( + pmem_path(µvm_state, "pmem0").as_deref(), + Some("/new/backing/file") + ); + } + + #[test] + fn test_apply_pmem_overrides_unknown_device() { + let mut microvm_state = microvm_state_with_devices(); + + let overrides = vec![PmemOverride { + id: String::from("does-not-exist"), + path_on_host: String::from("/new/backing/file"), + }]; + + let err = apply_pmem_overrides(&mut microvm_state, &overrides).unwrap_err(); + assert!( + matches!( + &err, + SnapshotStateFromFileError::UnknownPmemDevice(id) if id == "does-not-exist" + ), + "unexpected error: {err:?}" + ); + // The existing device must be left untouched. + assert!(pmem_path(µvm_state, "pmem0").is_some()); + } + + #[test] + fn test_apply_pmem_overrides_empty_is_noop() { + let mut microvm_state = microvm_state_with_devices(); + let original = pmem_path(µvm_state, "pmem0"); + + apply_pmem_overrides(&mut microvm_state, &[]).unwrap(); + + assert_eq!(pmem_path(µvm_state, "pmem0"), original); + } + #[test] fn test_create_guest_memory() { let mem_state = GuestMemoryState { diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 7693b305f65..dc6330294fc 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -1331,6 +1331,7 @@ mod tests { network_overrides: vec![], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], }, ))); check_unsupported(runtime_request(VmmAction::SetEntropyDevice( diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 4f312f88f5d..ab3858d62c1 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -64,6 +64,16 @@ pub struct VsockOverride { pub uds_path: String, } +/// Allows for changing the host backing file of a virtio-pmem device during +/// snapshot restore. +#[derive(Debug, PartialEq, Eq, Deserialize)] +pub struct PmemOverride { + /// The ID of the pmem device to modify. + pub id: String, + /// The new host path for the pmem device's backing file. + pub path_on_host: String, +} + /// Stores the configuration that will be used for loading a snapshot. #[derive(Debug, PartialEq, Eq)] pub struct LoadSnapshotParams { @@ -85,6 +95,8 @@ pub struct LoadSnapshotParams { /// advancing kvmclock by the wall-clock time elapsed since the snapshot was taken. When false /// (default), kvmclock resumes from where it was at snapshot time. pub clock_realtime: bool, + /// The pmem devices to override on load. + pub pmem_overrides: Vec, } /// Stores the configuration for loading a snapshot that is provided by the user. @@ -120,6 +132,9 @@ pub struct LoadSnapshotConfig { /// [x86_64 only] When set to true, passes `KVM_CLOCK_REALTIME` to `KVM_SET_CLOCK` on restore. #[serde(default)] pub clock_realtime: bool, + /// The pmem devices to override on load. + #[serde(default)] + pub pmem_overrides: Vec, } /// Stores the configuration used for managing snapshot memory. diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 45bc3301cf6..96b9d4c3a97 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -317,6 +317,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { network_overrides: vec![], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], })) .unwrap(); @@ -403,6 +404,7 @@ fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &s network_overrides: vec![], vsock_override: None, clock_realtime: false, + pmem_overrides: vec![], }); let err = preboot_api_controller.handle_preboot_request(req); assert!( diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 8339ae1695e..53346fa8aa3 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -1094,6 +1094,7 @@ def restore_from_snapshot( rename_interfaces: dict = None, vsock_override: str = None, clock_realtime: bool = False, + pmem_overrides: list = None, *, uffd_handler_name: str = None, ): @@ -1111,8 +1112,17 @@ def restore_from_snapshot( jailed_mem = Path("/") / jailed_snapshot.mem.name jailed_vmstate = Path("/") / jailed_snapshot.vmstate.name - snapshot_disks = [v for k, v in jailed_snapshot.disks.items()] - assert len(snapshot_disks) > 0, "Snapshot requires at least one disk." + # Skip auto-jailing pmem backing files that have an override - the caller + # is responsible for placing the backing file at the override path. This + # ensures the test exercises the override path rather than silently + # falling back to the snapshot's baked-in path. + overridden_pmem_ids = {o["id"] for o in (pmem_overrides or [])} + snapshot_disks = [ + v for k, v in jailed_snapshot.disks.items() if k not in overridden_pmem_ids + ] + assert ( + len(snapshot_disks) > 0 or overridden_pmem_ids + ), "Snapshot requires at least one disk." jailed_disks = [] for disk in snapshot_disks: jailed_disks.append(self.create_jailed_resource(disk)) @@ -1159,6 +1169,9 @@ def restore_from_snapshot( if clock_realtime: optional_kwargs["clock_realtime"] = clock_realtime + if pmem_overrides is not None: + optional_kwargs["pmem_overrides"] = pmem_overrides + self.api.snapshot_load.put( mem_backend=mem_backend, snapshot_path=str(jailed_vmstate), diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 7cecc80509b..4a9a2514c4f 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -625,6 +625,94 @@ def test_snapshot_rename_vsock( restored_vm.restore_from_snapshot(snapshot, vsock_override="/v.sock2", resume=True) +@pin_guest_kernel(GUEST_KERNEL_DEFAULT) +def test_snapshot_override_pmem(uvm_configured, microvm_factory): + """ + Test that we can restore a snapshot and point a virtio-pmem device to a + different host backing file. + """ + vm = uvm_configured + + pmem_fs = drive_tools.FilesystemFile( + os.path.join(vm.fsfiles, "pmem_scratch"), size=2 + ) + vm.add_pmem("pmem0", pmem_fs.path, False, False) + vm.add_net_iface() + vm.start() + + # Write a marker to the pmem device so we can verify the overridden file + # is used after restore. + marker = "pmem_override_marker" + vm.ssh.check_output("mkdir -p /tmp/pmem_mnt") + vm.ssh.check_output("mount /dev/pmem0 -o dax=always /tmp/pmem_mnt") + vm.ssh.check_output(f'echo "{marker}" > /tmp/pmem_mnt/marker') + + snapshot = vm.snapshot_full() + # Killing the VM flushes any guest writes to the backing file. + vm.kill() + + restored_vm = microvm_factory.build() + restored_vm.spawn() + + # Move (not copy) the snapshot's pmem backing file to a new path with a + # different basename. `restore_from_snapshot` skips auto-jailing pmem files + # that have an override, so the file is no longer reachable at its baked-in + # path. This guarantees the restore can only succeed if it actually consults + # the override path - otherwise the test fails. + original_pmem = snapshot.disks["pmem0"] + override_src = Path(vm.fsfiles) / "pmem0_override.img" + shutil.move(original_pmem, override_src) + # Hardlink the override into the restored VM's chroot and chown it to the + # jailer's uid:gid so Firecracker can open it. + override_jailed = restored_vm.create_jailed_resource(override_src) + + restored_vm.restore_from_snapshot( + snapshot, + pmem_overrides=[ + {"id": "pmem0", "path_on_host": override_jailed}, + ], + resume=True, + ) + + # The override file is the same backing file moved aside after the marker + # was written, so the marker must still be readable through the pmem device. + _, stdout, _ = restored_vm.ssh.check_output("cat /tmp/pmem_mnt/marker") + assert stdout.strip() == marker + + +@pin_guest_kernel(GUEST_KERNEL_DEFAULT) +def test_pmem_override_fails_unknown_id(uvm_configured, microvm_factory): + """ + Providing a pmem override with an unknown id should fail snapshot + restore and cause Firecracker to exit. + """ + vm = uvm_configured + + pmem_fs = drive_tools.FilesystemFile( + os.path.join(vm.fsfiles, "pmem_scratch"), size=2 + ) + vm.add_pmem("pmem0", pmem_fs.path, False, False) + vm.add_net_iface() + vm.start() + + snapshot = vm.snapshot_full() + vm.kill() + + restored_vm = microvm_factory.build() + restored_vm.spawn() + + with pytest.raises(RuntimeError, match="Unknown Pmem device"): + restored_vm.restore_from_snapshot( + snapshot, + pmem_overrides=[ + {"id": "nonexistent", "path_on_host": "/fake/path"}, + ], + resume=True, + ) + + restored_vm.mark_killed() + + SLEEP_SECONDS = 30 CLOCK_SOURCES = {"x86_64": ["tsc", "kvm-clock"], "aarch64": ["arch_sys_counter"]}[