diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 9946d3516cc..e09c0887a3a 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -9,7 +9,7 @@ use std::ffi::CString; use std::fmt::Debug; use vm_fdt::{Error as VmFdtError, FdtWriter, FdtWriterNode}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, GuestMemoryRegion}; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; @@ -22,7 +22,7 @@ use crate::device_manager::mmio::MMIODeviceInfo; use crate::device_manager::pci_mngr::PciDevices; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; -use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestRegionType}; // This is a value for uniquely identifying the FDT node declaring the interrupt controller. const GIC_PHANDLE: u32 = 1; @@ -227,14 +227,20 @@ fn create_memory_node(fdt: &mut FdtWriter, guest_mem: &GuestMemoryMmap) -> Resul // The reason we do this is that Linux does not allow remapping system memory. However, without // remap, kernel drivers cannot get virtual addresses to read data from device memory. Leaving // this memory region out allows Linux kernel modules to remap and thus read this region. - let mem_size = guest_mem.last_addr().raw_value() - - super::layout::DRAM_MEM_START - - super::layout::SYSTEM_MEM_SIZE - + 1; - let mem_reg_prop = &[ - super::layout::DRAM_MEM_START + super::layout::SYSTEM_MEM_SIZE, - mem_size, - ]; + + // Pick the first (and only) memory region + let dram_region = guest_mem + .iter() + .find(|region| region.region_type == GuestRegionType::Dram) + .unwrap(); + // Find the start of memory after the system memory region + let start_addr = dram_region + .start_addr() + .unchecked_add(super::layout::SYSTEM_MEM_SIZE); + // Size of the memory is the region size minus the system memory size + let mem_size = dram_region.len() - super::layout::SYSTEM_MEM_SIZE; + + let mem_reg_prop = &[start_addr.raw_value(), mem_size]; let mem = fdt.begin_node("memory@ram")?; fdt.property_string("device_type", "memory")?; fdt.property_array_u64("reg", mem_reg_prop)?; diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 74c5204af0e..4e82a7d3d56 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -22,7 +22,7 @@ use std::fs::File; use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, GuestMemoryRegion}; use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; @@ -30,7 +30,9 @@ use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; -use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{ + Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, GuestRegionType, +}; use crate::vstate::vcpu::KvmVcpuError; use crate::{DeviceManager, Kvm, Vcpu, VcpuConfig, Vm, logger}; @@ -151,31 +153,29 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> Opti usize_to_u64(initrd_size), usize_to_u64(super::GUEST_PAGE_SIZE), ); - match 
GuestAddress(get_fdt_addr(guest_mem)).checked_sub(rounded_size) { - Some(offset) => { - if guest_mem.address_in_range(offset) { - Some(offset.raw_value()) - } else { - None - } - } - None => None, - } + GuestAddress(get_fdt_addr(guest_mem)) + .checked_sub(rounded_size) + .filter(|&addr| guest_mem.address_in_range(addr)) + .map(|addr| addr.raw_value()) } // Auxiliary function to get the address where the device tree blob is loaded. fn get_fdt_addr(mem: &GuestMemoryMmap) -> u64 { + // Find the first (and only) DRAM region. + let dram_region = mem + .iter() + .find(|region| region.region_type == GuestRegionType::Dram) + .unwrap(); + // If the memory allocated is smaller than the size allocated for the FDT, // we return the start of the DRAM so that // we allow the code to try and load the FDT. - - if let Some(addr) = mem.last_addr().checked_sub(layout::FDT_MAX_SIZE as u64 - 1) - && mem.address_in_range(addr) - { - return addr.raw_value(); - } - - layout::DRAM_MEM_START + dram_region + .last_addr() + .checked_sub(layout::FDT_MAX_SIZE as u64 - 1) + .filter(|&addr| mem.address_in_range(addr)) + .map(|addr| addr.raw_value()) + .unwrap_or(layout::DRAM_MEM_START) } /// Load linux kernel into guest memory. diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index b18267c6a1e..4d6b906768e 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -31,12 +31,13 @@ pub mod xstate; #[allow(missing_docs)] pub mod generated; +use std::cmp::max; use std::fs::File; use kvm::Kvm; use layout::{ - CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, - MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, + CMDLINE_START, MMIO32_MEM_SIZE, MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, + PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, }; use linux_loader::configurator::linux::LinuxBootConfigurator; use linux_loader::configurator::pvh::PvhBootConfigurator; @@ -59,7 +60,7 @@ use crate::initrd::InitrdConfig; use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ - Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, + Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, GuestRegionType, }; use crate::vstate::vcpu::KvmVcpuConfigureError; use crate::{Vcpu, VcpuConfig, Vm, logger}; @@ -253,10 +254,6 @@ fn configure_pvh( initrd: &Option, ) -> Result<(), ConfigurationError> { const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; - let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); - let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); - let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); // Vector to hold modules (currently either empty or holding initrd). 
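The two aarch64 helpers above (`initrd_load_addr` and `get_fdt_addr`) now share the same shape: subtract with underflow checking, keep the result only if it still lies inside guest memory, otherwise fall back. A minimal standalone sketch of that pattern follows; the `DRAM_START`/`DRAM_SIZE` values and the 2 MiB FDT window are illustrative placeholders, not Firecracker's real layout constants.

// Sketch of the checked_sub + filter + unwrap_or pattern used by get_fdt_addr() above.
// DRAM_START, DRAM_SIZE and the window size are placeholders.
const DRAM_START: u64 = 0x8000_0000;
const DRAM_SIZE: u64 = 128 << 20;

fn fdt_addr(last_addr: u64, fdt_max_size: u64, in_range: impl Fn(u64) -> bool) -> u64 {
    last_addr
        .checked_sub(fdt_max_size - 1) // None if guest memory is smaller than the window
        .filter(|&addr| in_range(addr)) // discard addresses outside guest RAM
        .unwrap_or(DRAM_START) // fall back to the start of DRAM
}

fn main() {
    let in_range = |addr: u64| (DRAM_START..DRAM_START + DRAM_SIZE).contains(&addr);
    let last_addr = DRAM_START + DRAM_SIZE - 1;
    // Normal case: the FDT window sits at the very top of DRAM.
    assert_eq!(fdt_addr(last_addr, 2 << 20, in_range), last_addr - (2 << 20) + 1);
    // Tiny guest memory: fall back to the start of DRAM.
    assert_eq!(fdt_addr(DRAM_START + 0xfff, 2 << 20, in_range), DRAM_START);
}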
@@ -294,36 +291,21 @@ fn configure_pvh( type_: E820_RESERVED, ..Default::default() }); - let last_addr = guest_mem.last_addr(); - if last_addr > first_addr_past_64bits { - memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_64bits.raw_value(), - size: last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - } - - if last_addr > first_addr_past_32bits { + for region in guest_mem + .iter() + .filter(|region| region.region_type == GuestRegionType::Dram) + { + // the first 1MB is reserved for the kernel + let addr = max(himem_start, region.start_addr()); memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_32bits.raw_value(), - size: (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) - .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), + addr: addr.raw_value(), + size: region.last_addr().unchecked_offset_from(addr) + 1, type_: MEMMAP_TYPE_RAM, ..Default::default() }); } - memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: end_32bit_gap_start - .unchecked_offset_from(himem_start) - .min(last_addr.unchecked_offset_from(himem_start) + 1), - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - // Construct the hvm_start_info structure and serialize it into // boot_params. This will be stored at PVH_INFO_START address, and %rbx // will be initialized to contain PVH_INFO_START prior to starting the @@ -368,10 +350,6 @@ fn configure_64bit_boot( const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; const KERNEL_LOADER_OTHER: u8 = 0xff; const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. - let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); - let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); - let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); @@ -409,35 +387,20 @@ fn configure_64bit_boot( E820_RESERVED, )?; - let last_addr = guest_mem.last_addr(); - - if last_addr > first_addr_past_64bits { - add_e820_entry( - &mut params, - first_addr_past_64bits.raw_value(), - last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, - E820_RAM, - )?; - } - - if last_addr > first_addr_past_32bits { + for region in guest_mem + .iter() + .filter(|region| region.region_type == GuestRegionType::Dram) + { + // the first 1MB is reserved for the kernel + let addr = max(himem_start, region.start_addr()); add_e820_entry( &mut params, - first_addr_past_32bits.raw_value(), - (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) - .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), + addr.raw_value(), + region.last_addr().unchecked_offset_from(addr) + 1, E820_RAM, )?; } - add_e820_entry( - &mut params, - himem_start.raw_value(), - (last_addr.unchecked_offset_from(himem_start) + 1) - .min(end_32bit_gap_start.unchecked_offset_from(himem_start)), - E820_RAM, - )?; - LinuxBootConfigurator::write_bootparams( &BootParams::new(¶ms, GuestAddress(layout::ZERO_PAGE_START)), guest_mem, @@ -573,6 +536,7 @@ mod tests { use linux_loader::loader::bootparam::boot_e820_entry; use super::*; + use crate::arch::x86_64::layout::FIRST_ADDR_PAST_32BITS; use crate::test_utils::{arch_mem, single_region_mem}; use crate::utils::mib_to_bytes; use crate::vstate::resources::ResourceAllocator; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 0dea14207ad..0f9ef70813e 100644 --- a/src/vmm/src/builder.rs +++ 
b/src/vmm/src/builder.rs @@ -171,7 +171,7 @@ pub fn build_microvm_for_boot( // Build custom CPU config if a custom template is provided. let mut vm = Vm::new(&kvm)?; let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; - vm.register_memory_regions(guest_memory)?; + vm.register_dram_memory_regions(guest_memory)?; let mut device_manager = DeviceManager::new( event_manager, @@ -421,7 +421,7 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - vm.register_memory_regions(guest_memory) + vm.restore_memory_regions(guest_memory, µvm_state.vm_state.memory) .map_err(StartMicrovmError::Vm)?; #[cfg(target_arch = "x86_64")] @@ -472,7 +472,6 @@ pub fn build_microvm_from_snapshot( event_manager, vm_resources, instance_id: &instance_info.id, - restored_from_file: uffd.is_none(), vcpus_exit_evt: &vcpus_exit_evt, }; #[allow(unused_mut)] diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index f7514ed1e0c..c3a209a4b26 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -587,7 +587,7 @@ pub(crate) mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_dram_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); @@ -633,7 +633,7 @@ pub(crate) mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_dram_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); @@ -686,7 +686,7 @@ pub(crate) mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_dram_memory_regions(guest_mem).unwrap(); #[cfg(target_arch = "x86_64")] vm.setup_irqchip().unwrap(); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index f35190b5841..8e894d0fc06 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -440,7 +440,6 @@ pub struct DeviceRestoreArgs<'a> { pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, - pub restored_from_file: bool, } impl std::fmt::Debug for DeviceRestoreArgs<'_> { @@ -450,7 +449,6 @@ impl std::fmt::Debug for DeviceRestoreArgs<'_> { .field("vm", &self.vm) .field("vm_resources", &self.vm_resources) .field("instance_id", &self.instance_id) - .field("restored_from_file", &self.restored_from_file) .finish() } } @@ -488,7 +486,6 @@ impl<'a> Persist<'a> for DeviceManager { event_manager: constructor_args.event_manager, vm_resources: constructor_args.vm_resources, instance_id: constructor_args.instance_id, - restored_from_file: constructor_args.restored_from_file, }; let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; @@ -506,7 +503,6 @@ impl<'a> Persist<'a> for DeviceManager { mem: constructor_args.mem, vm_resources: 
constructor_args.vm_resources, instance_id: constructor_args.instance_id, - restored_from_file: constructor_args.restored_from_file, event_manager: constructor_args.event_manager, }; let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index f4f9ffe69d2..bf5252d2d17 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -249,7 +249,6 @@ pub struct PciDevicesConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, - pub restored_from_file: bool, pub event_manager: &'a mut EventManager, } @@ -260,7 +259,6 @@ impl<'a> Debug for PciDevicesConstructorArgs<'a> { .field("mem", &self.mem) .field("vm_resources", &self.vm_resources) .field("instance_id", &self.instance_id) - .field("restored_from_file", &self.restored_from_file) .finish() } } @@ -425,10 +423,7 @@ impl<'a> Persist<'a> for PciDevices { if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new( Balloon::restore( - BalloonConstructorArgs { - mem: mem.clone(), - restored_from_file: constructor_args.restored_from_file, - }, + BalloonConstructorArgs { mem: mem.clone() }, &balloon_state.device_state, ) .unwrap(), @@ -723,7 +718,6 @@ mod tests { mem: vmm.vm.guest_memory(), vm_resources, instance_id: "microvm-id", - restored_from_file: true, event_manager: &mut event_manager, }; let _restored_dev_manager = diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index d8d486d9ed7..7616f252658 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -154,7 +154,6 @@ pub struct MMIODevManagerConstructorArgs<'a> { pub event_manager: &'a mut EventManager, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, - pub restored_from_file: bool, } impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -432,10 +431,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new(Balloon::restore( - BalloonConstructorArgs { - mem: mem.clone(), - restored_from_file: constructor_args.restored_from_file, - }, + BalloonConstructorArgs { mem: mem.clone() }, &balloon_state.device_state, )?)); @@ -743,7 +739,6 @@ mod tests { event_manager: &mut event_manager, vm_resources, instance_id: "microvm-id", - restored_from_file: true, }; let _restored_dev_manager = MMIODeviceManager::restore(restore_args, &device_manager_state.mmio_state).unwrap(); diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 87a82c4fa9d..f0ecf77bc9e 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -14,7 +14,7 @@ use super::super::ActivateError; use super::super::device::{DeviceState, VirtioDevice}; use super::super::queue::Queue; use super::metrics::METRICS; -use super::util::{compact_page_frame_numbers, remove_range}; +use super::util::compact_page_frame_numbers; use super::{ BALLOON_DEV_ID, BALLOON_NUM_QUEUES, BALLOON_QUEUE_SIZES, DEFLATE_INDEX, INFLATE_INDEX, MAX_PAGE_COMPACT_BUFFER, MAX_PAGES_IN_DESC, MIB_TO_4K_PAGES, STATS_INDEX, @@ -32,7 +32,9 @@ use crate::devices::virtio::queue::InvalidAvailIdx; use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::logger::IncMetric; use 
crate::utils::u64_to_usize; -use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; +use crate::vstate::memory::{ + Address, ByteValued, Bytes, GuestAddress, GuestMemoryExtension, GuestMemoryMmap, +}; use crate::{impl_device_type, mem_size_mib}; const SIZE_OF_U32: usize = std::mem::size_of::(); @@ -171,7 +173,6 @@ pub struct Balloon { pub(crate) device_state: DeviceState, // Implementation specific fields. - pub(crate) restored_from_file: bool, pub(crate) stats_polling_interval_s: u16, pub(crate) stats_timer: TimerFd, // The index of the previous stats descriptor is saved because @@ -188,7 +189,6 @@ impl Balloon { amount_mib: u32, deflate_on_oom: bool, stats_polling_interval_s: u16, - restored_from_file: bool, ) -> Result { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; @@ -228,7 +228,6 @@ impl Balloon { queues, device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, - restored_from_file, stats_polling_interval_s, stats_timer, stats_desc_index: None, @@ -335,10 +334,9 @@ impl Balloon { let guest_addr = GuestAddress(u64::from(page_frame_number) << VIRTIO_BALLOON_PFN_SHIFT); - if let Err(err) = remove_range( - mem, - (guest_addr, u64::from(range_len) << VIRTIO_BALLOON_PFN_SHIFT), - self.restored_from_file, + if let Err(err) = mem.discard_range( + guest_addr, + usize::try_from(range_len).unwrap() << VIRTIO_BALLOON_PFN_SHIFT, ) { error!("Error removing memory range: {:?}", err); } @@ -739,7 +737,7 @@ pub(crate) mod tests { // Test all feature combinations. for deflate_on_oom in [true, false].iter() { for stats_interval in [0, 1].iter() { - let mut balloon = Balloon::new(0, *deflate_on_oom, *stats_interval, false).unwrap(); + let mut balloon = Balloon::new(0, *deflate_on_oom, *stats_interval).unwrap(); assert_eq!(balloon.device_type(), VIRTIO_ID_BALLOON); let features: u64 = (1u64 << VIRTIO_F_VERSION_1) @@ -766,7 +764,7 @@ pub(crate) mod tests { #[test] fn test_virtio_read_config() { - let balloon = Balloon::new(0x10, true, 0, false).unwrap(); + let balloon = Balloon::new(0x10, true, 0).unwrap(); let cfg = BalloonConfig { amount_mib: 16, @@ -800,7 +798,7 @@ pub(crate) mod tests { #[test] fn test_virtio_write_config() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let expected_config_space: [u8; BALLOON_CONFIG_SPACE_SIZE] = [0x00, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]; @@ -826,7 +824,7 @@ pub(crate) mod tests { #[test] fn test_invalid_request() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); // Only initialize the inflate queue to demonstrate invalid request handling. 
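The inflate path above now forwards each run of balloon pages directly to `GuestMemoryMmap::discard_range`. Below is a small sketch of the address arithmetic involved, assuming the usual 4 KiB balloon page (a PFN shift of 12); the `discard` callback is a stand-in for the real trait method, not code from the patch.

// Sketch of the PFN -> (guest address, byte length) conversion done before
// calling discard_range() in the inflate handler above.
const VIRTIO_BALLOON_PFN_SHIFT: u32 = 12; // 4 KiB balloon pages (assumed)

fn discard_pfn_run(
    page_frame_number: u32,
    range_len: u32,
    discard: impl Fn(u64, usize) -> Result<(), String>,
) -> Result<(), String> {
    // Guest physical address of the first page in the run.
    let guest_addr = u64::from(page_frame_number) << VIRTIO_BALLOON_PFN_SHIFT;
    // Length of the run in bytes.
    let len = usize::try_from(range_len).unwrap() << VIRTIO_BALLOON_PFN_SHIFT;
    discard(guest_addr, len)
}

fn main() {
    // A run of 2 pages starting at PFN 0x100 covers bytes [0x10_0000, 0x10_2000).
    discard_pfn_run(0x100, 2, |addr, len| {
        assert_eq!((addr, len), (0x10_0000, 0x2000));
        Ok(())
    })
    .unwrap();
}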
@@ -887,7 +885,7 @@ pub(crate) mod tests { #[test] fn test_inflate() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -959,7 +957,7 @@ pub(crate) mod tests { #[test] fn test_deflate() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -1009,7 +1007,7 @@ pub(crate) mod tests { #[test] fn test_stats() { - let mut balloon = Balloon::new(0, true, 1, false).unwrap(); + let mut balloon = Balloon::new(0, true, 1).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -1101,7 +1099,7 @@ pub(crate) mod tests { #[test] fn test_process_balloon_queues() { - let mut balloon = Balloon::new(0x10, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0x10, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -1116,7 +1114,7 @@ pub(crate) mod tests { #[test] fn test_update_stats_interval() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let q = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, q.create_queue()); @@ -1129,7 +1127,7 @@ pub(crate) mod tests { ); balloon.update_stats_polling_interval(0).unwrap(); - let mut balloon = Balloon::new(0, true, 1, false).unwrap(); + let mut balloon = Balloon::new(0, true, 1).unwrap(); let mem = default_mem(); let q = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, q.create_queue()); @@ -1147,14 +1145,14 @@ pub(crate) mod tests { #[test] fn test_cannot_update_inactive_device() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); // Assert that we can't update an inactive device. balloon.update_size(1).unwrap_err(); } #[test] fn test_num_pages() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); // Switch the state to active. balloon.device_state = DeviceState::Activated(ActiveState { mem: single_region_mem(32 << 20), diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 3922b4b8385..6fdd00c434c 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -142,7 +142,7 @@ pub mod tests { #[test] fn test_event_handler() { let mut event_manager = EventManager::new().unwrap(); - let mut balloon = Balloon::new(0, true, 10, false).unwrap(); + let mut balloon = Balloon::new(0, true, 10).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index c5fa227c620..e92356c394e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -95,7 +95,6 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. 
pub mem: GuestMemoryMmap, - pub restored_from_file: bool, } impl Persist<'_> for Balloon { @@ -122,12 +121,7 @@ impl Persist<'_> for Balloon { ) -> Result { // We can safely create the balloon with arbitrary flags and // num_pages because we will overwrite them after. - let mut balloon = Balloon::new( - 0, - false, - state.stats_polling_interval_s, - constructor_args.restored_from_file, - )?; + let mut balloon = Balloon::new(0, false, state.stats_polling_interval_s)?; let mut num_queues = BALLOON_NUM_QUEUES; // As per the virtio 1.1 specification, the statistics queue @@ -184,7 +178,7 @@ mod tests { let mut mem = vec![0; 4096]; // Create and save the balloon device. - let balloon = Balloon::new(0x42, false, 2, false).unwrap(); + let balloon = Balloon::new(0x42, false, 2).unwrap(); Snapshot::new(balloon.save()) .save(&mut mem.as_mut_slice()) @@ -192,10 +186,7 @@ mod tests { // Deserialize and restore the balloon device. let restored_balloon = Balloon::restore( - BalloonConstructorArgs { - mem: guest_mem, - restored_from_file: true, - }, + BalloonConstructorArgs { mem: guest_mem }, &Snapshot::load_without_crc_check(mem.as_slice()) .unwrap() .data, @@ -203,7 +194,6 @@ mod tests { .unwrap(); assert_eq!(restored_balloon.device_type(), VIRTIO_ID_BALLOON); - assert!(restored_balloon.restored_from_file); assert_eq!(restored_balloon.acked_features, balloon.acked_features); assert_eq!(restored_balloon.avail_features, balloon.avail_features); diff --git a/src/vmm/src/devices/virtio/balloon/util.rs b/src/vmm/src/devices/virtio/balloon/util.rs index 35ef69f972f..1d00891783e 100644 --- a/src/vmm/src/devices/virtio/balloon/util.rs +++ b/src/vmm/src/devices/virtio/balloon/util.rs @@ -65,57 +65,6 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { result } -pub(crate) fn remove_range( - guest_memory: &GuestMemoryMmap, - range: (GuestAddress, u64), - restored_from_file: bool, -) -> Result<(), RemoveRegionError> { - let (guest_address, range_len) = range; - - if let Some(region) = guest_memory.find_region(guest_address) { - if guest_address.0 + range_len > region.start_addr().0 + region.len() { - return Err(RemoveRegionError::MalformedRange); - } - let phys_address = guest_memory - .get_host_address(guest_address) - .map_err(|_| RemoveRegionError::AddressTranslation)?; - - // Mmap a new anonymous region over the present one in order to create a hole. - // This workaround is (only) needed after resuming from a snapshot because the guest memory - // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored_from_file { - // SAFETY: The address and length are known to be valid. - let ret = unsafe { - libc::mmap( - phys_address.cast(), - u64_to_usize(range_len), - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ret == libc::MAP_FAILED { - return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); - } - }; - - // Madvise the region in order to mark it as not used. - // SAFETY: The address and length are known to be valid. 
- let ret = unsafe { - let range_len = u64_to_usize(range_len); - libc::madvise(phys_address.cast(), range_len, libc::MADV_DONTNEED) - }; - if ret < 0 { - return Err(RemoveRegionError::MadviseFail(io::Error::last_os_error())); - } - - Ok(()) - } else { - Err(RemoveRegionError::RegionNotFound) - } -} - #[cfg(test)] mod tests { use std::fmt::Debug; @@ -174,88 +123,6 @@ mod tests { ); } - #[test] - fn test_remove_range() { - let page_size: usize = 0x1000; - let mem = single_region_mem(2 * page_size); - - // Fill the memory with ones. - let ones = vec![1u8; 2 * page_size]; - mem.write(&ones[..], GuestAddress(0)).unwrap(); - - // Remove the first page. - remove_range(&mem, (GuestAddress(0), page_size as u64), false).unwrap(); - - // Check that the first page is zeroed. - let mut actual_page = vec![0u8; page_size]; - mem.read(actual_page.as_mut_slice(), GuestAddress(0)) - .unwrap(); - assert_eq!(vec![0u8; page_size], actual_page); - // Check that the second page still contains ones. - mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) - .unwrap(); - assert_eq!(vec![1u8; page_size], actual_page); - - // Malformed range: the len is too big. - assert_match!( - remove_range(&mem, (GuestAddress(0), 0x10000), false).unwrap_err(), - RemoveRegionError::MalformedRange - ); - - // Region not mapped. - assert_match!( - remove_range(&mem, (GuestAddress(0x10000), 0x10), false).unwrap_err(), - RemoveRegionError::RegionNotFound - ); - - // Madvise fail: the guest address is not aligned to the page size. - assert_match!( - remove_range(&mem, (GuestAddress(0x20), page_size as u64), false).unwrap_err(), - RemoveRegionError::MadviseFail(_) - ); - } - - #[test] - fn test_remove_range_on_restored() { - let page_size: usize = 0x1000; - let mem = single_region_mem(2 * page_size); - - // Fill the memory with ones. - let ones = vec![1u8; 2 * page_size]; - mem.write(&ones[..], GuestAddress(0)).unwrap(); - - // Remove the first page. - remove_range(&mem, (GuestAddress(0), page_size as u64), true).unwrap(); - - // Check that the first page is zeroed. - let mut actual_page = vec![0u8; page_size]; - mem.read(actual_page.as_mut_slice(), GuestAddress(0)) - .unwrap(); - assert_eq!(vec![0u8; page_size], actual_page); - // Check that the second page still contains ones. - mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) - .unwrap(); - assert_eq!(vec![1u8; page_size], actual_page); - - // Malformed range: the len is too big. - assert_match!( - remove_range(&mem, (GuestAddress(0), 0x10000), true).unwrap_err(), - RemoveRegionError::MalformedRange - ); - - // Region not mapped. - assert_match!( - remove_range(&mem, (GuestAddress(0x10000), 0x10), true).unwrap_err(), - RemoveRegionError::RegionNotFound - ); - - // Mmap fail: the guest address is not aligned to the page size. 
- assert_match!( - remove_range(&mem, (GuestAddress(0x20), page_size as u64), true).unwrap_err(), - RemoveRegionError::MmapFail(_) - ); - } - /// ------------------------------------- /// BEGIN PROPERTY BASED TESTING use proptest::prelude::*; diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 09cc7c4e31d..b7aa8061d76 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -179,6 +179,7 @@ pub mod tests { #![allow(clippy::undocumented_unsafe_blocks)] use std::os::unix::ffi::OsStrExt; + use vm_memory::GuestMemoryRegion; use vmm_sys_util::tempfile::TempFile; use super::*; @@ -186,7 +187,7 @@ pub mod tests { use crate::utils::u64_to_usize; use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory; - use crate::vstate::memory::{Bitmap, Bytes, GuestMemory}; + use crate::vstate::memory::{Bitmap, Bytes, GuestMemory, GuestRegionMmapExt}; const FILE_LEN: u32 = 1024; // 2 pages of memory should be enough to test read/write ops and also dirty tracking. @@ -227,20 +228,23 @@ pub mod tests { true, HugePageConfig::None, ) - .unwrap(), + .unwrap() + .into_iter() + .map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0)) + .collect(), ) .unwrap() } fn check_dirty_mem(mem: &GuestMemoryMmap, addr: GuestAddress, len: u32) { - let bitmap = mem.find_region(addr).unwrap().bitmap().as_ref().unwrap(); + let bitmap = mem.find_region(addr).unwrap().bitmap(); for offset in addr.0..addr.0 + u64::from(len) { assert!(bitmap.dirty_at(u64_to_usize(offset))); } } fn check_clean_mem(mem: &GuestMemoryMmap, addr: GuestAddress, len: u32) { - let bitmap = mem.find_region(addr).unwrap().bitmap().as_ref().unwrap(); + let bitmap = mem.find_region(addr).unwrap().bitmap(); for offset in addr.0..addr.0 + u64::from(len) { assert!(!bitmap.dirty_at(u64_to_usize(offset))); } diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index a6157834d90..831595f0940 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -378,7 +378,7 @@ impl VhostUserHandleImpl { let vhost_user_net_reg = VhostUserMemoryRegionInfo { guest_phys_addr: region.start_addr().raw_value(), memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, + userspace_addr: region.inner.as_ptr() as u64, mmap_offset, mmap_handle, }; @@ -478,7 +478,7 @@ pub(crate) mod tests { use crate::devices::virtio::test_utils::default_interrupt; use crate::test_utils::create_tmp_socket; use crate::vstate::memory; - use crate::vstate::memory::GuestAddress; + use crate::vstate::memory::{GuestAddress, GuestRegionMmapExt}; pub(crate) fn create_mem(file: File, regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { GuestMemoryMmap::from_regions( @@ -488,7 +488,10 @@ pub(crate) mod tests { Some(file), false, ) - .unwrap(), + .unwrap() + .into_iter() + .map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0)) + .collect(), ) .unwrap() } @@ -798,7 +801,7 @@ pub(crate) mod tests { .map(|region| VhostUserMemoryRegionInfo { guest_phys_addr: region.start_addr().raw_value(), memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, + userspace_addr: region.inner.as_ptr() as u64, mmap_offset: region.file_offset().unwrap().start(), mmap_handle: region.file_offset().unwrap().file().as_raw_fd(), }) diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 212b6105831..ee76bf6800b 100644 --- 
a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -588,7 +588,7 @@ mod tests { use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; - use crate::vstate::memory::GuestMemoryRegionState; + use crate::vstate::memory::{GuestMemoryRegionState, GuestRegionType}; fn default_vmm_with_devices() -> Vmm { let mut event_manager = EventManager::new().expect("Cannot create EventManager"); @@ -693,6 +693,7 @@ mod tests { regions: vec![GuestMemoryRegionState { base_address: 0, size: 0x20000, + region_type: GuestRegionType::Dram, }], }; diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index f0f4c1907d8..53f1185115a 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -6,6 +6,7 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex, MutexGuard}; use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; use crate::cpu_config::templates::CustomCpuTemplate; use crate::device_manager::persist::SharedDeviceType; @@ -481,11 +482,14 @@ impl VmResources { Ok(()) } - /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. + /// Allocates the given guest memory regions. /// /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise /// prefers anonymous memory for performance reasons. - pub fn allocate_guest_memory(&self) -> Result, MemoryError> { + fn allocate_memory_regions( + &self, + regions: &[(GuestAddress, usize)], + ) -> Result, MemoryError> { let vhost_user_device_used = self .block .devices @@ -501,22 +505,27 @@ impl VmResources { // because that would require running a backend process. If in the future we converge to // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. - let regions = - crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); if vhost_user_device_used { memory::memfd_backed( - regions.as_ref(), + regions, self.machine_config.track_dirty_pages, self.machine_config.huge_pages, ) } else { memory::anonymous( - regions.into_iter(), + regions.iter().copied(), self.machine_config.track_dirty_pages, self.machine_config.huge_pages, ) } } + + /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. 
+ pub fn allocate_guest_memory(&self) -> Result, MemoryError> { + let regions = + crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); + self.allocate_memory_regions(®ions) + } } impl From<&VmResources> for VmmConfig { @@ -1542,7 +1551,7 @@ mod tests { .unwrap(); let err = vm_resources .update_from_restored_device(SharedDeviceType::Balloon(Arc::new(Mutex::new( - Balloon::new(128, false, 0, true).unwrap(), + Balloon::new(128, false, 0).unwrap(), )))) .unwrap_err(); assert!( diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 2cfcc274b5d..89b4e238b3b 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -5,7 +5,7 @@ use std::sync::{Arc, Mutex}; -use vm_memory::GuestAddress; +use vm_memory::{GuestAddress, GuestRegionCollection}; use vmm_sys_util::tempdir::TempDir; use crate::builder::build_microvm_for_boot; @@ -15,8 +15,7 @@ use crate::test_utils::mock_resources::{MockBootSourceConfig, MockVmConfig, Mock use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::HugePageConfig; -use crate::vstate::memory; -use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap}; +use crate::vstate::memory::{self, GuestMemoryMmap, GuestRegionMmap, GuestRegionMmapExt}; use crate::{EventManager, Vmm}; pub mod mock_resources; @@ -43,9 +42,12 @@ pub fn single_region_mem_at_raw(at: u64, size: usize) -> Vec { /// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking. pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { - GuestMemoryMmap::from_regions( + GuestRegionCollection::from_regions( memory::anonymous(regions.iter().copied(), false, HugePageConfig::None) - .expect("Cannot initialize memory"), + .expect("Cannot initialize memory") + .into_iter() + .map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0)) + .collect(), ) .unwrap() } diff --git a/src/vmm/src/vmm_config/balloon.rs b/src/vmm/src/vmm_config/balloon.rs index 83d419c49db..e56430d6dc6 100644 --- a/src/vmm/src/vmm_config/balloon.rs +++ b/src/vmm/src/vmm_config/balloon.rs @@ -88,9 +88,6 @@ impl BalloonBuilder { cfg.amount_mib, cfg.deflate_on_oom, cfg.stats_polling_interval_s, - // `restored` flag is false because this code path - // is never called by snapshot restore functionality. 
- false, )?))); Ok(()) @@ -178,7 +175,7 @@ pub(crate) mod tests { #[test] fn test_set_device() { let mut builder = BalloonBuilder::new(); - let balloon = Balloon::new(0, true, 0, true).unwrap(); + let balloon = Balloon::new(0, true, 0).unwrap(); builder.set_device(Arc::new(Mutex::new(balloon))); assert!(builder.inner.is_some()); } diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index fe506f6731e..c4185e78b2d 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -7,8 +7,11 @@ use std::fs::File; use std::io::SeekFrom; +use std::ops::Deref; use std::sync::Arc; +use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +use log::error; use serde::{Deserialize, Serialize}; pub use vm_memory::bitmap::{AtomicBitmap, BS, Bitmap, BitmapSlice}; pub use vm_memory::mmap::MmapRegionBuilder; @@ -17,7 +20,7 @@ pub use vm_memory::{ Address, ByteValued, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, MemoryRegionAddress, MmapRegion, address, }; -use vm_memory::{GuestMemoryError, WriteVolatile}; +use vm_memory::{GuestMemoryError, GuestMemoryRegionBytes, VolatileSlice, WriteVolatile}; use vmm_sys_util::errno; use crate::DirtyBitmap; @@ -27,7 +30,7 @@ use crate::vmm_config::machine_config::HugePageConfig; /// Type of GuestRegionMmap. pub type GuestRegionMmap = vm_memory::GuestRegionMmap>; /// Type of GuestMemoryMmap. -pub type GuestMemoryMmap = vm_memory::GuestRegionCollection; +pub type GuestMemoryMmap = vm_memory::GuestRegionCollection; /// Type of GuestMmapRegion. pub type GuestMmapRegion = vm_memory::MmapRegion>; @@ -52,6 +55,168 @@ pub enum MemoryError { FileMetadata(std::io::Error), } +/// Type of the guest region +#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub enum GuestRegionType { + /// Guest DRAM + Dram, +} + +/// An extension to GuestMemoryRegion that stores the type of region, and the KVM slot +/// number. +#[derive(Debug)] +pub struct GuestRegionMmapExt { + /// the wrapped GuestRegionMmap + pub inner: GuestRegionMmap, + /// the type of region + pub region_type: GuestRegionType, + /// the KVM slot number assigned to this region + pub slot: u32, +} + +impl GuestRegionMmapExt { + pub(crate) fn dram_from_mmap_region(region: GuestRegionMmap, slot: u32) -> Self { + GuestRegionMmapExt { + inner: region, + region_type: GuestRegionType::Dram, + slot, + } + } + + pub(crate) fn from_state( + region: GuestRegionMmap, + state: &GuestMemoryRegionState, + slot: u32, + ) -> Result { + Ok(GuestRegionMmapExt { + inner: region, + region_type: state.region_type, + slot, + }) + } + + pub(crate) fn kvm_userspace_memory_region(&self) -> kvm_userspace_memory_region { + let flags = if self.inner.bitmap().is_some() { + KVM_MEM_LOG_DIRTY_PAGES + } else { + 0 + }; + + kvm_userspace_memory_region { + flags, + slot: self.slot, + guest_phys_addr: self.inner.start_addr().raw_value(), + memory_size: self.inner.len(), + userspace_addr: self.inner.as_ptr() as u64, + } + } + + pub(crate) fn discard_range( + &self, + caddr: MemoryRegionAddress, + len: usize, + ) -> Result<(), GuestMemoryError> { + let phys_address = self.get_host_address(caddr)?; + + match (self.inner.file_offset(), self.inner.flags()) { + // If and only if we are resuming from a snapshot file, we have a file and it's mapped + // private + (Some(_), flags) if flags & libc::MAP_PRIVATE != 0 => { + // Mmap a new anonymous region over the present one in order to create a hole + // with zero pages. 
+ // This workaround is (only) needed after resuming from a snapshot file because the + // guest memory is mmaped from file as private. In this case, MADV_DONTNEED on the + // file only drops any anonymous pages in range, but subsequent accesses would read + // whatever page is stored on the backing file. Mmapping anonymous pages ensures + // it's zeroed. + // SAFETY: The address and length are known to be valid. + let ret = unsafe { + libc::mmap( + phys_address.cast(), + len, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + -1, + 0, + ) + }; + if ret == libc::MAP_FAILED { + let os_error = std::io::Error::last_os_error(); + error!("discard_range: mmap failed: {:?}", os_error); + Err(GuestMemoryError::IOError(os_error)) + } else { + Ok(()) + } + } + // Match either the case of an anonymous mapping, or the case + // of a shared file mapping. + // TODO: madvise(MADV_DONTNEED) doesn't actually work with memfd + // (or in general MAP_SHARED of a fd). In those cases we should use + // fallocate64(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE). + // We keep falling to the madvise branch to keep the previous behaviour. + _ => { + // Madvise the region in order to mark it as not used. + // SAFETY: The address and length are known to be valid. + let ret = unsafe { libc::madvise(phys_address.cast(), len, libc::MADV_DONTNEED) }; + if ret < 0 { + let os_error = std::io::Error::last_os_error(); + error!("discard_range: madvise failed: {:?}", os_error); + Err(GuestMemoryError::IOError(os_error)) + } else { + Ok(()) + } + } + } + } +} + +impl Deref for GuestRegionMmapExt { + type Target = MmapRegion>; + + fn deref(&self) -> &MmapRegion> { + &self.inner + } +} + +impl GuestMemoryRegionBytes for GuestRegionMmapExt {} + +#[allow(clippy::cast_possible_wrap)] +#[allow(clippy::cast_possible_truncation)] +impl GuestMemoryRegion for GuestRegionMmapExt { + type B = Option; + + fn len(&self) -> GuestUsize { + self.inner.len() + } + + fn start_addr(&self) -> GuestAddress { + self.inner.start_addr() + } + + fn bitmap(&self) -> BS<'_, Self::B> { + self.inner.bitmap() + } + + fn get_host_address( + &self, + addr: MemoryRegionAddress, + ) -> vm_memory::guest_memory::Result<*mut u8> { + self.inner.get_host_address(addr) + } + + fn file_offset(&self) -> Option<&FileOffset> { + self.inner.file_offset() + } + + fn get_slice( + &self, + offset: MemoryRegionAddress, + count: usize, + ) -> vm_memory::guest_memory::Result>> { + self.inner.get_slice(offset, count) + } +} + /// Creates a `Vec` of `GuestRegionMmap` with the given configuration pub fn create( regions: impl Iterator, @@ -175,6 +340,19 @@ where /// Store the dirty bitmap in internal store fn store_dirty_bitmap(&self, dirty_bitmap: &DirtyBitmap, page_size: usize); + + /// Apply a function to each region in a memory range + fn try_for_each_region_in_range( + &self, + addr: GuestAddress, + range_len: usize, + f: F, + ) -> Result<(), GuestMemoryError> + where + F: FnMut(&GuestRegionMmapExt, MemoryRegionAddress, usize) -> Result<(), GuestMemoryError>; + + /// Discards a memory range, freeing up memory pages + fn discard_range(&self, addr: GuestAddress, range_len: usize) -> Result<(), GuestMemoryError>; } /// State of a guest memory region saved to file/buffer. @@ -186,6 +364,8 @@ pub struct GuestMemoryRegionState { pub base_address: u64, /// Region size. pub size: usize, + /// Region type + pub region_type: GuestRegionType, } /// Describes guest memory regions and their snapshot file mappings. 
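For readers unfamiliar with the primitive the new `discard_range` builds on, here is a small self-contained sketch of the madvise arm (anonymous or shared mappings). It is illustrative only and uses the `libc` crate; the MAP_PRIVATE-file arm instead maps anonymous zero pages over the range with MAP_FIXED, as the comments above explain.

// Standalone sketch (not part of the patch): punch a hole in an anonymous
// mapping with madvise(MADV_DONTNEED) and observe that it reads back as zeros.
use std::io::Error;

fn main() {
    let page = 4096usize;
    // SAFETY: we request a fresh anonymous mapping and check the result.
    let addr = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            2 * page,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_ANONYMOUS | libc::MAP_PRIVATE,
            -1,
            0,
        )
    };
    assert_ne!(addr, libc::MAP_FAILED, "mmap: {:?}", Error::last_os_error());

    // SAFETY: addr points to a live, writable mapping of 2 * page bytes.
    unsafe {
        // Dirty the first page.
        std::ptr::write_bytes(addr.cast::<u8>(), 1, page);
        // Drop it again; the kernel backs the next access with a zero page.
        assert_eq!(libc::madvise(addr, page, libc::MADV_DONTNEED), 0);
        assert_eq!(*addr.cast::<u8>(), 0);
        libc::munmap(addr, 2 * page);
    }
}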
@@ -213,6 +393,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { guest_memory_state.regions.push(GuestMemoryRegionState { base_address: region.start_addr().0, size: u64_to_usize(region.len()), + region_type: region.region_type, }); }); guest_memory_state @@ -242,8 +423,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { let mut writer_offset = 0; let page_size = get_page_size().map_err(MemoryError::PageSize)?; - let write_result = self.iter().zip(0..).try_for_each(|(region, slot)| { - let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + let write_result = self.iter().try_for_each(|region| { + let kvm_bitmap = dirty_bitmap.get(&region.slot).unwrap(); let firecracker_bitmap = region.bitmap(); let mut write_size = 0; let mut dirty_batch_start: u64 = 0; @@ -306,8 +487,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { /// Stores the dirty bitmap inside into the internal bitmap fn store_dirty_bitmap(&self, dirty_bitmap: &DirtyBitmap, page_size: usize) { - self.iter().zip(0..).for_each(|(region, slot)| { - let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + self.iter().for_each(|region| { + let kvm_bitmap = dirty_bitmap.get(&region.slot).unwrap(); let firecracker_bitmap = region.bitmap(); for (i, v) in kvm_bitmap.iter().enumerate() { @@ -323,6 +504,49 @@ impl GuestMemoryExtension for GuestMemoryMmap { } }); } + + fn try_for_each_region_in_range( + &self, + addr: GuestAddress, + range_len: usize, + mut f: F, + ) -> Result<(), GuestMemoryError> + where + F: FnMut(&GuestRegionMmapExt, MemoryRegionAddress, usize) -> Result<(), GuestMemoryError>, + { + let mut cur = addr; + let mut remaining = range_len; + + // iterate over all adjacent consecutive regions in range + while let Some(region) = self.find_region(cur) { + let start = region.to_region_addr(cur).unwrap(); + let len = std::cmp::min( + // remaining bytes inside the region + u64_to_usize(region.len() - start.raw_value()), + // remaining bytes to discard + remaining, + ); + + f(region, start, len)?; + + remaining -= len; + if remaining == 0 { + return Ok(()); + } + + cur = cur + .checked_add(len as u64) + .ok_or(GuestMemoryError::GuestAddressOverflow)?; + } + // if we exit the loop because we didn't find a region, return an error + Err(GuestMemoryError::InvalidGuestAddress(cur)) + } + + fn discard_range(&self, addr: GuestAddress, range_len: usize) -> Result<(), GuestMemoryError> { + self.try_for_each_region_in_range(addr, range_len, |region, start, len| { + region.discard_range(start, len) + }) + } } fn create_memfd( @@ -355,6 +579,23 @@ Ok(mem_file) } +/// Test utilities +pub mod test_utils { + use super::*; + + /// Converts a vec of GuestRegionMmap into a GuestMemoryMmap using GuestRegionMmapExt + pub fn into_region_ext(regions: Vec) -> GuestMemoryMmap { + GuestMemoryMmap::from_regions( + regions + .into_iter() + .zip(0u32..)
// assign dummy slots + .map(|(region, slot)| GuestRegionMmapExt::dram_from_mmap_region(region, slot)) + .collect(), + ) + .unwrap() + } +} + #[cfg(test)] mod tests { #![allow(clippy::undocumented_unsafe_blocks)] @@ -366,7 +607,9 @@ mod tests { use super::*; use crate::snapshot::Snapshot; + use crate::test_utils::single_region_mem; use crate::utils::{get_page_size, mib_to_bytes}; + use crate::vstate::memory::test_utils::into_region_ext; #[test] fn test_anonymous() { @@ -448,10 +691,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + into_region_ext(anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap()); let dirty_map = [ // page 0: not dirty @@ -486,7 +727,7 @@ mod tests { } } - fn check_serde(guest_memory: &GuestMemoryMmap) { + fn check_serde(guest_memory: &M) { let mut snapshot_data = vec![0u8; 10000]; let original_state = guest_memory.describe(); Snapshot::new(&original_state) @@ -504,15 +745,14 @@ mod tests { let region_size = page_size * 3; // Test with a single region - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous( [(GuestAddress(0), region_size)].into_iter(), false, HugePageConfig::None, ) .unwrap(), - ) - .unwrap(); + ); check_serde(&guest_memory); // Test with some regions @@ -521,10 +761,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + into_region_ext(anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap()); check_serde(&guest_memory); } @@ -537,20 +775,21 @@ mod tests { (GuestAddress(0), page_size), (GuestAddress(page_size as u64 * 2), page_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); let expected_memory_state = GuestMemoryState { regions: vec![ GuestMemoryRegionState { base_address: 0, size: page_size, + region_type: GuestRegionType::Dram, }, GuestMemoryRegionState { base_address: page_size as u64 * 2, size: page_size, + region_type: GuestRegionType::Dram, }, ], }; @@ -563,20 +802,21 @@ mod tests { (GuestAddress(0), page_size * 3), (GuestAddress(page_size as u64 * 4), page_size * 3), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); let expected_memory_state = GuestMemoryState { regions: vec![ GuestMemoryRegionState { base_address: 0, size: page_size * 3, + region_type: GuestRegionType::Dram, }, GuestMemoryRegionState { base_address: page_size as u64 * 4, size: page_size * 3, + region_type: GuestRegionType::Dram, }, ], }; @@ -597,10 +837,9 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); // Check that Firecracker bitmap is clean. 
guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -622,10 +861,8 @@ mod tests { let mut memory_file = TempFile::new().unwrap().into_file(); guest_memory.dump(&mut memory_file).unwrap(); - let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(memory_file, memory_state.regions(), false).unwrap(), - ) - .unwrap(); + let restored_guest_memory = + into_region_ext(snapshot_file(memory_file, memory_state.regions(), false).unwrap()); // Check that the region contents are the same. let mut restored_region = vec![0u8; page_size * 2]; @@ -652,10 +889,9 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -684,10 +920,8 @@ mod tests { guest_memory.dump_dirty(&mut file, &dirty_bitmap).unwrap(); // We can restore from this because this is the first dirty dump. - let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(file, memory_state.regions(), false).unwrap(), - ) - .unwrap(); + let restored_guest_memory = + into_region_ext(snapshot_file(file, memory_state.regions(), false).unwrap()); // Check that the region contents are the same. let mut restored_region = vec![0u8; region_size]; @@ -743,10 +977,9 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { @@ -782,4 +1015,103 @@ mod tests { seals.insert(memfd::FileSeal::SealGrow); memfd.add_seals(&seals).unwrap_err(); } + + /// This asserts that $lhs matches $rhs. + macro_rules! assert_match { + ($lhs:expr, $rhs:pat) => {{ assert!(matches!($lhs, $rhs)) }}; + } + + #[test] + fn test_discard_range() { + let page_size: usize = 0x1000; + let mem = single_region_mem(2 * page_size); + + // Fill the memory with ones. + let ones = vec![1u8; 2 * page_size]; + mem.write(&ones[..], GuestAddress(0)).unwrap(); + + // Remove the first page. + mem.discard_range(GuestAddress(0), page_size).unwrap(); + + // Check that the first page is zeroed. + let mut actual_page = vec![0u8; page_size]; + mem.read(actual_page.as_mut_slice(), GuestAddress(0)) + .unwrap(); + assert_eq!(vec![0u8; page_size], actual_page); + // Check that the second page still contains ones. + mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) + .unwrap(); + assert_eq!(vec![1u8; page_size], actual_page); + + // Malformed range: the len is too big. + assert_match!( + mem.discard_range(GuestAddress(0), 0x10000).unwrap_err(), + GuestMemoryError::InvalidGuestAddress(_) + ); + + // Region not mapped. + assert_match!( + mem.discard_range(GuestAddress(0x10000), 0x10).unwrap_err(), + GuestMemoryError::InvalidGuestAddress(_) + ); + + // Madvise fail: the guest address is not aligned to the page size. 
+ assert_match!( + mem.discard_range(GuestAddress(0x20), page_size) + .unwrap_err(), + GuestMemoryError::IOError(_) + ); + } + + #[test] + fn test_discard_range_on_file() { + let page_size: usize = 0x1000; + let mut memory_file = TempFile::new().unwrap().into_file(); + memory_file.set_len(2 * page_size as u64).unwrap(); + memory_file.write_all(&vec![2u8; 2 * page_size]).unwrap(); + let mem = into_region_ext( + snapshot_file( + memory_file, + std::iter::once((GuestAddress(0), 2 * page_size)), + false, + ) + .unwrap(), + ); + + // Fill the memory with ones. + let ones = vec![1u8; 2 * page_size]; + mem.write(&ones[..], GuestAddress(0)).unwrap(); + + // Remove the first page. + mem.discard_range(GuestAddress(0), page_size).unwrap(); + + // Check that the first page is zeroed. + let mut actual_page = vec![0u8; page_size]; + mem.read(actual_page.as_mut_slice(), GuestAddress(0)) + .unwrap(); + assert_eq!(vec![0u8; page_size], actual_page); + // Check that the second page still contains ones. + mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) + .unwrap(); + assert_eq!(vec![1u8; page_size], actual_page); + + // Malformed range: the len is too big. + assert_match!( + mem.discard_range(GuestAddress(0), 0x10000).unwrap_err(), + GuestMemoryError::InvalidGuestAddress(_) + ); + + // Region not mapped. + assert_match!( + mem.discard_range(GuestAddress(0x10000), 0x10).unwrap_err(), + GuestMemoryError::InvalidGuestAddress(_) + ); + + // Mmap fail: the guest address is not aligned to the page size. + assert_match!( + mem.discard_range(GuestAddress(0x20), page_size) + .unwrap_err(), + GuestMemoryError::IOError(_) + ); + } } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 0d60a298ecd..3cc2319b360 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -15,8 +15,8 @@ use std::sync::{Arc, Mutex, MutexGuard}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; use kvm_bindings::{ - KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MSI_VALID_DEVID, - KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MSI_VALID_DEVID, KvmIrqRouting, + kvm_irq_routing_entry, }; use kvm_ioctls::VmFd; use log::debug; @@ -34,7 +34,8 @@ use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::bus::Bus; use crate::vstate::interrupts::{InterruptError, MsixVector, MsixVectorConfig, MsixVectorGroup}; use crate::vstate::memory::{ - Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestMemoryState, + GuestRegionMmap, GuestRegionMmapExt, MemoryError, }; use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; @@ -88,7 +89,9 @@ pub enum VmError { /// Error calling mincore: {0} Mincore(vmm_sys_util::errno::Error), /// ResourceAllocator error: {0} - ResourceAllocator(#[from] vm_allocator::Error) + ResourceAllocator(#[from] vm_allocator::Error), + /// MemoryError error: {0} + MemoryError(#[from] MemoryError), } /// Contains Vm functions that are usable across CPU architectures @@ -172,49 +175,61 @@ impl Vm { } } + fn register_memory_region(&mut self, region: Arc) -> Result<(), VmError> { + let new_guest_memory = self + .common + .guest_memory + .insert_region(Arc::clone(®ion))?; + + // SAFETY: Safe because the fd is a valid KVM file descriptor. 
+ unsafe { + self.fd() + .set_user_memory_region(region.kvm_userspace_memory_region()) + .map_err(VmError::SetUserMemoryRegion)?; + } + + self.common.guest_memory = new_guest_memory; + + Ok(()) + } + /// Register a list of new memory regions to this [`Vm`]. - pub fn register_memory_regions( + pub fn register_dram_memory_regions( &mut self, regions: Vec, ) -> Result<(), VmError> { for region in regions { - self.register_memory_region(region)? + let next_slot = self + .next_kvm_slot() + .ok_or(VmError::NotEnoughMemorySlots(self.common.max_memslots))?; + + let arcd_region = + Arc::new(GuestRegionMmapExt::dram_from_mmap_region(region, next_slot)); + + self.register_memory_region(arcd_region)? } Ok(()) } - /// Register a new memory region to this [`Vm`]. - pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { - let next_slot = self - .next_kvm_slot() - .ok_or(VmError::NotEnoughMemorySlots(self.common.max_memslots))?; - - let flags = if region.bitmap().is_some() { - KVM_MEM_LOG_DIRTY_PAGES - } else { - 0 - }; - - let memory_region = kvm_userspace_memory_region { - slot: next_slot, - guest_phys_addr: region.start_addr().raw_value(), - memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, - flags, - }; + /// Restore a list of memory regions to this [`Vm`] from a saved snapshot state. + /// + /// Note: regions and state.regions need to be in the same order. + pub fn restore_memory_regions( + &mut self, + regions: Vec, + state: &GuestMemoryState, + ) -> Result<(), VmError> { + for (region, state) in regions.into_iter().zip(state.regions.iter()) { + let next_slot = self + .next_kvm_slot() + .ok_or(VmError::NotEnoughMemorySlots(self.common.max_memslots))?; - let new_guest_memory = self.common.guest_memory.insert_region(Arc::new(region))?; + let arcd_region = Arc::new(GuestRegionMmapExt::from_state(region, state, next_slot)?); - // SAFETY: Safe because the fd is a valid KVM file descriptor. - unsafe { - self.fd() - .set_user_memory_region(memory_region) - .map_err(VmError::SetUserMemoryRegion)?; + self.register_memory_region(arcd_region)? } - self.common.guest_memory = new_guest_memory; - Ok(()) } @@ -238,28 +253,26 @@ impl Vm { /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { - self.guest_memory() - .iter() - .zip(0u32..) - .for_each(|(region, slot)| { - let _ = self.fd().get_dirty_log(slot, u64_to_usize(region.len())); - }); + self.guest_memory().iter().for_each(|region| { + let _ = self + .fd() + .get_dirty_log(region.slot, u64_to_usize(region.len())); + }); } /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. pub fn get_dirty_bitmap(&self) -> Result { self.guest_memory() .iter() - .zip(0u32..)
- .map(|(region, slot)| { + .map(|region| { let bitmap = match region.bitmap() { Some(_) => self .fd() - .get_dirty_log(slot, u64_to_usize(region.len())) + .get_dirty_log(region.slot, u64_to_usize(region.len())) .map_err(VmError::GetDirtyLog)?, - None => mincore_bitmap(region)?, + None => mincore_bitmap(®ion.inner)?, }; - Ok((slot, bitmap)) + Ok((region.slot, bitmap)) }) .collect() } @@ -520,7 +533,7 @@ pub(crate) mod tests { pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm) { let (kvm, mut vm) = setup_vm(); let gm = single_region_mem_raw(mem_size); - vm.register_memory_regions(gm).unwrap(); + vm.register_dram_memory_regions(gm).unwrap(); (kvm, vm) } @@ -538,14 +551,14 @@ pub(crate) mod tests { // Trying to set a memory region with a size that is not a multiple of GUEST_PAGE_SIZE // will result in error. let gm = single_region_mem_raw(0x10); - let res = vm.register_memory_regions(gm); + let res = vm.register_dram_memory_regions(gm); assert_eq!( res.unwrap_err().to_string(), "Cannot set the memory regions: Invalid argument (os error 22)" ); let gm = single_region_mem_raw(0x1000); - let res = vm.register_memory_regions(gm); + let res = vm.register_dram_memory_regions(gm); res.unwrap(); } @@ -580,7 +593,7 @@ pub(crate) mod tests { let region = GuestRegionMmap::new(region, GuestAddress(i as u64 * 0x1000)).unwrap(); - let res = vm.register_memory_region(region); + let res = vm.register_dram_memory_regions(vec![region]); if max_nr_regions <= i { assert!(