QEMU maps certain regions into the guest multiple times, as seen in the
trace below: the same userspace memory ends up backing more than one
guest physical range. Currently the MSHV kernel driver rejects such
mappings. To work around this, a record is kept (a static global list of
"slots", inspired by what the HVF accelerator has implemented). An
overlapping region is not registered with the hypervisor and is marked
as mapped=false. If there is an UNMAPPED_GPA exit, we look for a slot
that is unmapped and would cover the GPA. In that case we map out the
conflicting slot and map in the requested region.

mshv_set_phys_mem       add=1 name=pc.bios
mshv_map_memory      => u_a=7ffff4e00000 gpa=00fffc0000 size=00040000
mshv_set_phys_mem       add=1 name=ioapic
mshv_set_phys_mem       add=1 name=hpet
mshv_set_phys_mem       add=0 name=pc.ram
mshv_unmap_memory       u_a=7fff67e00000 gpa=0000000000 size=80000000
mshv_set_phys_mem       add=1 name=pc.ram
mshv_map_memory         u_a=7fff67e00000 gpa=0000000000 size=000c0000
mshv_set_phys_mem       add=1 name=pc.rom
mshv_map_memory         u_a=7ffff4c00000 gpa=00000c0000 size=00020000
mshv_set_phys_mem       add=1 name=pc.bios
mshv_remap_attempt   => u_a=7ffff4e20000 gpa=00000e0000 size=00020000

Signed-off-by: Magnus Kulke <magnusku...@linux.microsoft.com>
---
 accel/mshv/mem.c            | 264 ++++++++++++++++++++++++++++++++----
 accel/mshv/trace-events     |   7 +-
 include/system/mshv.h       |  16 ++-
 target/i386/mshv/mshv-cpu.c |  43 ++++++
 4 files changed, 295 insertions(+), 35 deletions(-)

diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c
index 6d7a726898..0ffe379601 100644
--- a/accel/mshv/mem.c
+++ b/accel/mshv/mem.c
@@ -20,44 +20,167 @@
 #include <sys/ioctl.h>
 #include "trace.h"
 
+MshvMemorySlot mem_slots[MSHV_MAX_MEM_SLOTS];
+
+static MshvMemorySlot *find_free_mem_slot(void)
+{
+    for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) {
+        if (mem_slots[i].memory_size == 0) {
+            return &mem_slots[i];
+        }
+    }
+
+    return NULL;
+}
+
+/* Find a _currently mapped_ memory slot that overlaps in userspace */
+static MshvMemorySlot *find_overlap_mem_slot(const MshvMemorySlot *slot)
+{
+    MshvMemorySlot *other;
+    bool overlaps;
+    uint64_t start_1 = slot->userspace_addr, start_2;
+    size_t len_1 = slot->memory_size, len_2;
+
+    for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) {
+        other = &mem_slots[i];
+
+        if (other == slot) {
+            continue;
+        }
+
+        start_2 = other->userspace_addr;
+        len_2 = other->memory_size;
+
+        overlaps = ranges_overlap(start_1, len_1, start_2, len_2);
+        if (other->mapped && overlaps) {
+            return other;
+        }
+    }
+
+    return NULL;
+}
+
 static int set_guest_memory(int vm_fd, const mshv_user_mem_region *region)
 {
     int ret;
 
     ret = ioctl(vm_fd, MSHV_SET_GUEST_MEMORY, region);
     if (ret < 0) {
-        error_report("failed to set guest memory");
-        return -errno;
+        error_report("failed to set guest memory: %s", strerror(errno));
+        return -1;
     }
 
     return 0;
 }
 
-static int map_or_unmap(int vm_fd, const MshvMemoryRegion *mr, bool map)
+static int map_or_unmap(int vm_fd, const MshvMemorySlot *slot, bool map)
 {
     struct mshv_user_mem_region region = {0};
 
-    region.guest_pfn = mr->guest_phys_addr >> MSHV_PAGE_SHIFT;
-    region.size = mr->memory_size;
-    region.userspace_addr = mr->userspace_addr;
+    region.guest_pfn = slot->guest_phys_addr >> MSHV_PAGE_SHIFT;
+    region.size = slot->memory_size;
+    region.userspace_addr = slot->userspace_addr;
 
     if (!map) {
         region.flags |= (1 << MSHV_SET_MEM_BIT_UNMAP);
-        trace_mshv_unmap_memory(mr->userspace_addr, mr->guest_phys_addr,
-                                mr->memory_size);
+        trace_mshv_unmap_memory(slot->userspace_addr, slot->guest_phys_addr,
+                                slot->memory_size);
         return set_guest_memory(vm_fd, &region);
     }
 
     region.flags = BIT(MSHV_SET_MEM_BIT_EXECUTABLE);
-    if (!mr->readonly) {
+    if (!slot->readonly) {
         region.flags |= BIT(MSHV_SET_MEM_BIT_WRITABLE);
     }
 
-    trace_mshv_map_memory(mr->userspace_addr, mr->guest_phys_addr,
-                          mr->memory_size);
+    trace_mshv_map_memory(slot->userspace_addr, slot->guest_phys_addr,
+                          slot->memory_size);
     return set_guest_memory(vm_fd, &region);
 }
 
+static MshvMemorySlot *find_mem_slot_by_region(uint64_t gpa, uint64_t size,
+                                               uint64_t userspace_addr)
+{
+    MshvMemorySlot *slot;
+
+    for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) {
+        slot = &mem_slots[i];
+
+        if (slot->guest_phys_addr == gpa &&
+            slot->userspace_addr  == userspace_addr &&
+            slot->memory_size     == size) {
+            trace_mshv_found_slot(slot->userspace_addr, slot->guest_phys_addr,
+                                  slot->memory_size);
+            return slot;
+        }
+    }
+
+    return NULL;
+}
+
+static MshvMemorySlot* find_mem_slot_by_gpa(uint64_t gpa)
+{
+    uint64_t gpa_offset;
+    MshvMemorySlot *slot;
+
+    trace_mshv_find_slot_by_gpa(gpa);
+
+    for (int i = 0; i < MSHV_MAX_MEM_SLOTS; i++) {
+        slot = &mem_slots[i];
+
+        gpa_offset = gpa - slot->guest_phys_addr;
+        if (slot->guest_phys_addr <= gpa && gpa_offset < slot->memory_size) {
+            trace_mshv_found_slot(slot->userspace_addr, slot->guest_phys_addr,
+                                  slot->memory_size);
+            return slot;
+        }
+    }
+
+    return NULL;
+}
+
+MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa)
+{
+    MshvMemorySlot *gpa_slot, *overlap_slot;
+    int ret;
+
+    /* return early if no slot is found */
+    gpa_slot = find_mem_slot_by_gpa(gpa);
+    if (gpa_slot == NULL) {
+        return MshvRemapNoMapping;
+    }
+
+    overlap_slot = find_overlap_mem_slot(gpa_slot);
+    if (overlap_slot == NULL) {
+        return MshvRemapNoOverlap;
+    }
+
+    /* unmap overlapping slot */
+    ret = map_or_unmap(vm_fd, overlap_slot, false);
+    if (ret < 0) {
+        error_report("failed to unmap overlap region");
+        abort();
+    }
+    overlap_slot->mapped = false;
+    warn_report("mapped out userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx",
+                overlap_slot->userspace_addr,
+                overlap_slot->guest_phys_addr,
+                overlap_slot->memory_size);
+
+    /* map region for gpa */
+    ret = map_or_unmap(vm_fd, gpa_slot, true);
+    if (ret < 0) {
+        error_report("failed to map new region");
+        abort();
+    }
+    gpa_slot->mapped = true;
+    warn_report("mapped in  userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx",
+                gpa_slot->userspace_addr, gpa_slot->guest_phys_addr,
+                gpa_slot->memory_size);
+
+    return MshvRemapOk;
+}
+
 static int handle_unmapped_mmio_region_read(uint64_t gpa, uint64_t size,
                                             uint8_t *data)
 {
@@ -123,20 +246,106 @@ int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size,
     return -1;
 }
 
-static int set_memory(const MshvMemoryRegion *mshv_mr, bool add)
+static void clear_slot(MshvMemorySlot *slot)
 {
-    int ret = 0;
+    assert(slot);
+
+    *slot = (MshvMemorySlot) { 0 };
+}
+
+static int tracked_unmap(int vm_fd, uint64_t gpa, uint64_t size,
+                        uint64_t userspace_addr)
+{
+    int ret;
+    MshvMemorySlot *slot;
+
+    slot = find_mem_slot_by_region(gpa, size, userspace_addr);
+    if (!slot) {
+        trace_mshv_skip_unset_mem(userspace_addr, gpa, size);
+        /* no work to do */
+        return 0;
+    }
+
+    if (!slot->mapped) {
+        /* remove slot, no need to unmap */
+        clear_slot(slot);
+        return 0;
+    }
+
+    ret = map_or_unmap(vm_fd, slot, false);
+    if (ret < 0) {
+        error_report("failed to unmap memory region");
+        return ret;
+    }
+    clear_slot(slot);
+
+    return 0;
+}
 
-    if (!mshv_mr) {
-        error_report("Invalid mshv_mr");
+static int tracked_map(int vm_fd, uint64_t gpa, uint64_t size, bool readonly,
+                       uint64_t userspace_addr)
+{
+    MshvMemorySlot *slot, *overlap_slot;
+    int ret;
+
+    slot = find_mem_slot_by_region(gpa, size, userspace_addr);
+    if (slot) {
+        error_report("memory region already mapped at gpa=0x%lx, "
+                     "userspace_addr=0x%lx, size=0x%lx",
+                     slot->guest_phys_addr, slot->userspace_addr,
+                     slot->memory_size);
+        return -1;
+    }
+
+    slot = find_free_mem_slot();
+    if (!slot) {
+        error_report("no free memory slot available");
+        return -1;
+    }
+
+    slot->guest_phys_addr = gpa;
+    slot->userspace_addr = userspace_addr;
+    slot->memory_size = size;
+    slot->readonly = readonly;
+
+    overlap_slot = find_overlap_mem_slot(slot);
+    if (overlap_slot) {
+        trace_mshv_remap_attempt(slot->userspace_addr,
+                                 slot->guest_phys_addr,
+                                 slot->memory_size);
+        warn_report("attempt to map region [0x%lx-0x%lx], while "
+                    "[0x%lx-0x%lx] is already mapped in the guest",
+                    userspace_addr, userspace_addr + size - 1,
+                    overlap_slot->userspace_addr,
+                    overlap_slot->userspace_addr +
+                    overlap_slot->memory_size - 1);
+
+        /* do not register mem slot in hv, but record for later swap-in */
+        slot->mapped = false;
+
+        return 0;
+    }
+
+    ret = map_or_unmap(vm_fd, slot, true);
+    if (ret < 0) {
+        error_report("failed to map memory region");
         return -1;
     }
+    slot->mapped = true;
 
-    trace_mshv_set_memory(add, mshv_mr->guest_phys_addr,
-                          mshv_mr->memory_size,
-                          mshv_mr->userspace_addr, mshv_mr->readonly,
-                          ret);
-    return map_or_unmap(mshv_state->vm, mshv_mr, add);
+    return 0;
+}
+
+static int set_memory(uint64_t gpa, uint64_t size, bool readonly,
+                      uint64_t userspace_addr, bool add)
+{
+    int vm_fd = mshv_state->vm;
+
+    if (add) {
+        return tracked_map(vm_fd, gpa, size, readonly, userspace_addr);
+    }
+
+    return tracked_unmap(vm_fd, gpa, size, userspace_addr);
 }
 
 /*
@@ -172,9 +381,10 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section,
     bool writable = !area->readonly && !area->rom_device;
     hwaddr start_addr, mr_offset, size;
     void *ram;
-    MshvMemoryRegion mshv_mr = {0};
 
-    trace_mshv_set_phys_mem(add, section->mr->name);
+    size = align_section(section, &start_addr);
+
+    trace_mshv_set_phys_mem(add, section->mr->name, start_addr);
 
     /* If the memory device is a writable non-ram area, we do not
      * want to map it into the guest memory. If it is not a ROM device,
@@ -188,7 +398,6 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section,
         }
     }
 
-    size = align_section(section, &start_addr);
     if (!size) {
         return;
     }
@@ -198,14 +407,9 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section,
 
     ram = memory_region_get_ram_ptr(area) + mr_offset;
 
-    mshv_mr.guest_phys_addr = start_addr;
-    mshv_mr.memory_size = size;
-    mshv_mr.readonly = !writable;
-    mshv_mr.userspace_addr = (uint64_t)ram;
-
-    ret = set_memory(&mshv_mr, add);
+    ret = set_memory(start_addr, size, !writable, (uint64_t)ram, add);
     if (ret < 0) {
-        error_report("Failed to set memory region");
+        error_report("failed to set memory region");
         abort();
     }
 }
diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events
index bade57e22c..efd9dd7b3c 100644
--- a/accel/mshv/trace-events
+++ b/accel/mshv/trace-events
@@ -20,5 +20,10 @@ mshv_mem_write(uint64_t addr, size_t size) "\tgpa=%lx size=%lu"
 mshv_mem_read(uint64_t addr, size_t size) "\tgpa=%lx size=%lu"
 mshv_map_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx"
 mshv_unmap_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx"
-mshv_set_phys_mem(bool add, const char *name) "\tadd=%d name=%s"
+mshv_set_phys_mem(bool add, const char *name, uint64_t gpa) "\tadd=%d name=%s gpa=%lx"
+
+mshv_found_slot(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx"
+mshv_skip_unset_mem(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx"
+mshv_remap_attempt(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=%lx gpa=%010lx size=%08lx"
+mshv_find_slot_by_gpa(uint64_t gpa) "\tgpa=%010lx"
 mshv_handle_mmio(uint64_t gva, uint64_t gpa, uint64_t size, uint8_t access_type) "\tgva=%lx gpa=%010lx size=%lx access_type=%d"
diff --git a/include/system/mshv.h b/include/system/mshv.h
index 27d7e3dff3..124da05885 100644
--- a/include/system/mshv.h
+++ b/include/system/mshv.h
@@ -38,6 +38,8 @@ typedef struct hyperv_message hv_message;
 
 #define MSHV_MSR_ENTRIES_COUNT 64
 
+#define MSHV_MAX_MEM_SLOTS 32
+
 #ifdef CONFIG_MSHV_IS_POSSIBLE
 extern bool mshv_allowed;
 #define mshv_enabled() (mshv_allowed)
@@ -102,6 +104,12 @@ typedef enum MshvVmExit {
     MshvVmExitHlt      = 3,
 } MshvVmExit;
 
+typedef enum MshvRemapResult {
+    MshvRemapOk = 0,
+    MshvRemapNoMapping = 1,
+    MshvRemapNoOverlap = 2,
+} MshvRemapResult;
+
 void mshv_init_mmio_emu(void);
 int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd);
 void mshv_remove_vcpu(int vm_fd, int cpu_fd);
@@ -143,15 +151,15 @@ typedef struct MshvMsrEntries {
 int mshv_configure_msr(int cpu_fd, const MshvMsrEntry *msrs, size_t n_msrs);
 
 /* memory */
-typedef struct MshvMemoryRegion {
+typedef struct MshvMemorySlot {
     uint64_t guest_phys_addr;
     uint64_t memory_size;
     uint64_t userspace_addr;
     bool readonly;
-} MshvMemoryRegion;
+    bool mapped;
+} MshvMemorySlot;
 
-int mshv_add_mem(int vm_fd, const MshvMemoryRegion *mr);
-int mshv_remove_mem(int vm_fd, const MshvMemoryRegion *mr);
+MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa);
 int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size,
                         bool is_secure_mode, bool instruction_fetch);
 int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size,
diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c
index 41a3398ec8..083f161274 100644
--- a/target/i386/mshv/mshv-cpu.c
+++ b/target/i386/mshv/mshv-cpu.c
@@ -1073,6 +1073,43 @@ static int handle_mmio(CPUState *cpu, const struct hyperv_message *msg,
     return 0;
 }
 
+static int handle_unmapped_mem(int vm_fd, CPUState *cpu,
+                               const struct hyperv_message *msg,
+                               MshvVmExit *exit_reason)
+{
+    struct hv_x64_memory_intercept_message info = { 0 };
+    uint64_t gpa;
+    int ret;
+    enum MshvRemapResult remap_result;
+
+    ret = set_memory_info(msg, &info);
+    if (ret < 0) {
+        error_report("failed to convert message to memory info");
+        return -1;
+    }
+
+    gpa = info.guest_physical_address;
+
+    /* attempt to remap the region, in case of overlapping userspace mappings */
+    remap_result = mshv_remap_overlap_region(vm_fd, gpa);
+    *exit_reason = MshvVmExitIgnore;
+
+    switch (remap_result) {
+    case MshvRemapNoMapping:
+        /* if we didn't find a mapping, it is probably mmio */
+        return handle_mmio(cpu, msg, exit_reason);
+    case MshvRemapOk:
+        break;
+    case MshvRemapNoOverlap:
+        /* This should not happen, but we are forgiving it */
+        warn_report("found no overlap for unmapped region");
+        *exit_reason = MshvVmExitSpecial;
+        break;
+    }
+
+    return 0;
+}
+
 static int set_ioport_info(const struct hyperv_message *msg,
                            hv_x64_io_port_intercept_message *info)
 {
@@ -1449,6 +1486,12 @@ int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit)
     case HVMSG_UNRECOVERABLE_EXCEPTION:
         return MshvVmExitShutdown;
     case HVMSG_UNMAPPED_GPA:
+        ret = handle_unmapped_mem(vm_fd, cpu, msg, &exit_reason);
+        if (ret < 0) {
+            error_report("failed to handle unmapped memory");
+            return -1;
+        }
+        return exit_reason;
     case HVMSG_GPA_INTERCEPT:
         ret = handle_mmio(cpu, msg, &exit_reason);
         if (ret < 0) {
-- 
2.34.1


Reply via email to