QEMU maps certain regions into the guest multiple times, as seen in the trace below. Currently the MSHV kernel driver rejects those mappings. To work around this, a record of the mappings is kept (a static global list of "slots", inspired by what the HVF accelerator has implemented). A region whose userspace range overlaps an already-mapped slot is not registered with the hypervisor and is marked as mapped=false. If there is an UNMAPPED_GPA exit, we look for a slot that is unmapped and covers the faulting GPA. If such a slot is found, we map out the conflicting slot and map in the requested region.
mshv_set_phys_mem add=1 name=pc.bios mshv_map_memory => u_a=7ffff4e00000 gpa=00fffc0000 size=00040000 mshv_set_phys_mem add=1 name=ioapic mshv_set_phys_mem add=1 name=hpet mshv_set_phys_mem add=0 name=pc.ram mshv_unmap_memory u_a=7fff67e00000 gpa=0000000000 size=80000000 mshv_set_phys_mem add=1 name=pc.ram mshv_map_memory u_a=7fff67e00000 gpa=0000000000 size=000c0000 mshv_set_phys_mem add=1 name=pc.rom mshv_map_memory u_a=7ffff4c00000 gpa=00000c0000 size=00020000 mshv_set_phys_mem add=1 name=pc.bios mshv_remap_attempt => u_a=7ffff4e20000 gpa=00000e0000 size=00020000 Signed-off-by: Magnus Kulke <magnusku...@linux.microsoft.com> --- accel/mshv/mem.c | 264 ++++++++++++++++++++++++++++++++---- accel/mshv/trace-events | 7 +- include/system/mshv.h | 16 ++- target/i386/mshv/mshv-cpu.c | 43 ++++++ 4 files changed, 295 insertions(+), 35 deletions(-) diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c index 6d7a726898..0ffe379601 100644 --- a/accel/mshv/mem.c +++ b/accel/mshv/mem.c @@ -20,44 +20,167 @@ #include <sys/ioctl.h> #include "trace.h" +MshvMemorySlot mem_slots[MSHV_MAX_MEM_SLOTS]; + +static MshvMemorySlot *find_free_mem_slot(void) +{ + for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) { + if (mem_slots[i].memory_size == 0) { + return &mem_slots[i]; + } + } + + return NULL; +} + +/* Find _currently mapped_ memory slot, that is overlapping in userspace */ +static MshvMemorySlot *find_overlap_mem_slot(const MshvMemorySlot *slot) +{ + MshvMemorySlot *other; + bool overlaps; + uint64_t start_1 = slot->userspace_addr, start_2; + size_t len_1 = slot->memory_size, len_2; + + for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) { + other = &mem_slots[i]; + + if (other == slot) { + continue; + } + + start_2 = other->userspace_addr; + len_2 = other->memory_size; + + overlaps = ranges_overlap(start_1, len_1, start_2, len_2); + if (other->mapped && overlaps) { + return other; + } + } + + return NULL; +} + static int set_guest_memory(int vm_fd, const mshv_user_mem_region *region) { int ret; ret 
= ioctl(vm_fd, MSHV_SET_GUEST_MEMORY, region); if (ret < 0) { - error_report("failed to set guest memory"); - return -errno; + error_report("failed to set guest memory: %s", strerror(errno)); + return -1; } return 0; } -static int map_or_unmap(int vm_fd, const MshvMemoryRegion *mr, bool map) +static int map_or_unmap(int vm_fd, const MshvMemorySlot *slot, bool map) { struct mshv_user_mem_region region = {0}; - region.guest_pfn = mr->guest_phys_addr >> MSHV_PAGE_SHIFT; - region.size = mr->memory_size; - region.userspace_addr = mr->userspace_addr; + region.guest_pfn = slot->guest_phys_addr >> MSHV_PAGE_SHIFT; + region.size = slot->memory_size; + region.userspace_addr = slot->userspace_addr; if (!map) { region.flags |= (1 << MSHV_SET_MEM_BIT_UNMAP); - trace_mshv_unmap_memory(mr->userspace_addr, mr->guest_phys_addr, - mr->memory_size); + trace_mshv_unmap_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); return set_guest_memory(vm_fd, ®ion); } region.flags = BIT(MSHV_SET_MEM_BIT_EXECUTABLE); - if (!mr->readonly) { + if (!slot->readonly) { region.flags |= BIT(MSHV_SET_MEM_BIT_WRITABLE); } - trace_mshv_map_memory(mr->userspace_addr, mr->guest_phys_addr, - mr->memory_size); + trace_mshv_map_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); return set_guest_memory(vm_fd, ®ion); } +static MshvMemorySlot *find_mem_slot_by_region(uint64_t gpa, uint64_t size, + uint64_t userspace_addr) +{ + MshvMemorySlot *slot; + + for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) { + slot = &mem_slots[i]; + + if (slot->guest_phys_addr == gpa && + slot->userspace_addr == userspace_addr && + slot->memory_size == size) { + trace_mshv_found_slot(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return slot; + } + } + + return NULL; +} + +static MshvMemorySlot* find_mem_slot_by_gpa(uint64_t gpa) +{ + uint64_t gpa_offset; + MshvMemorySlot *slot; + + trace_mshv_find_slot_by_gpa(gpa); + + for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) { + 
slot = &mem_slots[i]; + + gpa_offset = gpa - slot->guest_phys_addr; + if (slot->guest_phys_addr <= gpa && gpa_offset < slot->memory_size) { + trace_mshv_found_slot(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return slot; + } + } + + return NULL; +} + +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa) +{ + MshvMemorySlot *gpa_slot, *overlap_slot; + int ret; + + /* return early if no slot is found */ + gpa_slot = find_mem_slot_by_gpa(gpa); + if (gpa_slot == NULL) { + return MshvRemapNoMapping; + } + + overlap_slot = find_overlap_mem_slot(gpa_slot); + if (overlap_slot == NULL) { + return MshvRemapNoOverlap; + } + + /* unmap overlapping slot */ + ret = map_or_unmap(vm_fd, overlap_slot, false); + if (ret < 0) { + error_report("failed to unmap overlap region"); + abort(); + } + overlap_slot->mapped = false; + warn_report("mapped out userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + overlap_slot->userspace_addr, + overlap_slot->guest_phys_addr, + overlap_slot->memory_size); + + /* map region for gpa */ + ret = map_or_unmap(vm_fd, gpa_slot, true); + if (ret < 0) { + error_report("failed to map new region"); + abort(); + } + gpa_slot->mapped = true; + warn_report("mapped in userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + gpa_slot->userspace_addr, gpa_slot->guest_phys_addr, + gpa_slot->memory_size); + + return MshvRemapOk; +} + static int handle_unmapped_mmio_region_read(uint64_t gpa, uint64_t size, uint8_t *data) { @@ -123,20 +246,106 @@ int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, return -1; } -static int set_memory(const MshvMemoryRegion *mshv_mr, bool add) +static void clear_slot(MshvMemorySlot *slot) { - int ret = 0; + assert(slot); + + *slot = (MshvMemorySlot) { 0 }; +} + +static int tracked_unmap(int vm_fd, uint64_t gpa, uint64_t size, + uint64_t userspace_addr) +{ + int ret; + MshvMemorySlot *slot; + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (!slot) { + 
trace_mshv_skip_unset_mem(userspace_addr, gpa, size); + /* no work to do */ + return 0; + } + + if (!slot->mapped) { + /* remove slot, no need to unmap */ + clear_slot(slot); + return 0; + } + + ret = map_or_unmap(vm_fd, slot, false); + if (ret < 0) { + error_report("failed to unmap memory region"); + return ret; + } + clear_slot(slot); + + return 0; +} - if (!mshv_mr) { - error_report("Invalid mshv_mr"); +static int tracked_map(int vm_fd, uint64_t gpa, uint64_t size, bool readonly, + uint64_t userspace_addr) +{ + MshvMemorySlot *slot, *overlap_slot; + int ret; + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (slot) { + error_report("memory region already mapped at gpa=0x%lx, " + "userspace_addr=0x%lx, size=0x%lx", + slot->guest_phys_addr, slot->userspace_addr, + slot->memory_size); + return -1; + } + + slot = find_free_mem_slot(); + if (!slot) { + error_report("no free memory slot available"); + return -1; + } + + slot->guest_phys_addr = gpa; + slot->userspace_addr = userspace_addr; + slot->memory_size = size; + slot->readonly = readonly; + + overlap_slot = find_overlap_mem_slot(slot); + if (overlap_slot) { + trace_mshv_remap_attempt(slot->userspace_addr, + slot->guest_phys_addr, + slot->memory_size); + warn_report("attempt to map region [0x%lx-0x%lx], while " + "[0x%lx-0x%lx] is already mapped in the guest", + userspace_addr, userspace_addr + size - 1, + overlap_slot->userspace_addr, + overlap_slot->userspace_addr + + overlap_slot->memory_size - 1); + + /* do not register mem slot in hv, but record for later swap-in */ + slot->mapped = false; + + return 0; + } + + ret = map_or_unmap(vm_fd, slot, true); + if (ret < 0) { + error_report("failed to map memory region"); return -1; } + slot->mapped = true; - trace_mshv_set_memory(add, mshv_mr->guest_phys_addr, - mshv_mr->memory_size, - mshv_mr->userspace_addr, mshv_mr->readonly, - ret); - return map_or_unmap(mshv_state->vm, mshv_mr, add); + return 0; +} + +static int set_memory(uint64_t gpa, uint64_t 
size, bool readonly, + uint64_t userspace_addr, bool add) +{ + int vm_fd = mshv_state->vm; + + if (add) { + return tracked_map(vm_fd, gpa, size, readonly, userspace_addr); + } + + return tracked_unmap(vm_fd, gpa, size, userspace_addr); } /* @@ -172,9 +381,10 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, bool writable = !area->readonly && !area->rom_device; hwaddr start_addr, mr_offset, size; void *ram; - MshvMemoryRegion mshv_mr = {0}; - trace_mshv_set_phys_mem(add, section->mr->name); + size = align_section(section, &start_addr); + + trace_mshv_set_phys_mem(add, section->mr->name, start_addr); /* If the memory device is a writable non-ram area, we do not * want to map it into the guest memory. If it is not a ROM device, @@ -188,7 +398,6 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, } } - size = align_section(section, &start_addr); if (!size) { return; } @@ -198,14 +407,9 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, ram = memory_region_get_ram_ptr(area) + mr_offset; - mshv_mr.guest_phys_addr = start_addr; - mshv_mr.memory_size = size; - mshv_mr.readonly = !writable; - mshv_mr.userspace_addr = (uint64_t)ram; - - ret = set_memory(&mshv_mr, add); + ret = set_memory(start_addr, size, !writable, (uint64_t)ram, add); if (ret < 0) { - error_report("Failed to set memory region"); + error_report("failed to set memory region"); abort(); } } diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events index bade57e22c..efd9dd7b3c 100644 --- a/accel/mshv/trace-events +++ b/accel/mshv/trace-events @@ -20,5 +20,10 @@ mshv_mem_write(uint64_t addr, size_t size) "\tgpa=%lx size=%lu" mshv_mem_read(uint64_t addr, size_t size) "\tgpa=%lx size=%lu" mshv_map_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx" mshv_unmap_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx" -mshv_set_phys_mem(bool 
add, const char *name) "\tadd=%d name=%s" +mshv_set_phys_mem(bool add, const char *name, uint64_t gpa) "\tadd=%d name=%s gpa=%lx" + +mshv_found_slot(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx" +mshv_skip_unset_mem(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx" +mshv_remap_attempt(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx" +mshv_find_slot_by_gpa(uint64_t gpa) "\tgpa=%010lx" mshv_handle_mmio(uint64_t gva, uint64_t gpa, uint64_t size, uint8_t access_type) "\tgva=%lx gpa=%010lx size=%lx access_type=%d" diff --git a/include/system/mshv.h b/include/system/mshv.h index 27d7e3dff3..124da05885 100644 --- a/include/system/mshv.h +++ b/include/system/mshv.h @@ -38,6 +38,8 @@ typedef struct hyperv_message hv_message; #define MSHV_MSR_ENTRIES_COUNT 64 +#define MSHV_MAX_MEM_SLOTS 32 + #ifdef CONFIG_MSHV_IS_POSSIBLE extern bool mshv_allowed; #define mshv_enabled() (mshv_allowed) @@ -102,6 +104,12 @@ typedef enum MshvVmExit { MshvVmExitHlt = 3, } MshvVmExit; +typedef enum MshvRemapResult { + MshvRemapOk = 0, + MshvRemapNoMapping = 1, + MshvRemapNoOverlap = 2, +} MshvRemapResult; + void mshv_init_mmio_emu(void); int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); void mshv_remove_vcpu(int vm_fd, int cpu_fd); @@ -143,15 +151,15 @@ typedef struct MshvMsrEntries { int mshv_configure_msr(int cpu_fd, const MshvMsrEntry *msrs, size_t n_msrs); /* memory */ -typedef struct MshvMemoryRegion { +typedef struct MshvMemorySlot { uint64_t guest_phys_addr; uint64_t memory_size; uint64_t userspace_addr; bool readonly; -} MshvMemoryRegion; + bool mapped; +} MshvMemorySlot; -int mshv_add_mem(int vm_fd, const MshvMemoryRegion *mr); -int mshv_remove_mem(int vm_fd, const MshvMemoryRegion *mr); +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa); int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, bool is_secure_mode, bool 
instruction_fetch); int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 41a3398ec8..083f161274 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -1073,6 +1073,43 @@ static int handle_mmio(CPUState *cpu, const struct hyperv_message *msg, return 0; } +static int handle_unmapped_mem(int vm_fd, CPUState *cpu, + const struct hyperv_message *msg, + MshvVmExit *exit_reason) +{ + struct hv_x64_memory_intercept_message info = { 0 }; + uint64_t gpa; + int ret; + enum MshvRemapResult remap_result; + + ret = set_memory_info(msg, &info); + if (ret < 0) { + error_report("failed to convert message to memory info"); + return -1; + } + + gpa = info.guest_physical_address; + + /* attempt to remap the region, in case of overlapping userspace mappings */ + remap_result = mshv_remap_overlap_region(vm_fd, gpa); + *exit_reason = MshvVmExitIgnore; + + switch (remap_result) { + case MshvRemapNoMapping: + /* if we didn't find a mapping, it is probably mmio */ + return handle_mmio(cpu, msg, exit_reason); + case MshvRemapOk: + break; + case MshvRemapNoOverlap: + /* This should not happen, but we are forgiving it */ + warn_report("found no overlap for unmapped region"); + *exit_reason = MshvVmExitSpecial; + break; + } + + return 0; +} + static int set_ioport_info(const struct hyperv_message *msg, hv_x64_io_port_intercept_message *info) { @@ -1449,6 +1486,12 @@ int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit) case HVMSG_UNRECOVERABLE_EXCEPTION: return MshvVmExitShutdown; case HVMSG_UNMAPPED_GPA: + ret = handle_unmapped_mem(vm_fd, cpu, msg, &exit_reason); + if (ret < 0) { + error_report("failed to handle unmapped memory"); + return -1; + } + return exit_reason; case HVMSG_GPA_INTERCEPT: ret = handle_mmio(cpu, msg, &exit_reason); if (ret < 0) { -- 2.34.1