Introduce ioctls for mapping and unmapping regions of guest memory.

Use a table of memory 'slots', similar to KVM's, but the slot
number is not visible to userspace.

For now, this simple implementation requires each new mapping to be
disjoint from existing ones - the underlying hypercalls have no such
restriction, and implicitly overwrite any existing mappings on the
pages in the specified regions.
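
As an illustration, a minimal userspace sequence might look like the
following (a sketch only: 'partition_fd' comes from MSHV_CREATE_PARTITION,
the sizes and guest frame number are arbitrary, and includes and error
handling are elided):

	/* Back the region with page-aligned, populated anonymous memory. */
	void *buf = mmap(NULL, 0x200000, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	struct mshv_user_mem_region region = {
		.size = 0x200000,		/* bytes, page-aligned */
		.guest_pfn = 0x100,		/* guest frame number */
		.userspace_addr = (__u64)buf,
		.flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
	};

	ioctl(partition_fd, MSHV_MAP_GUEST_MEMORY, &region);

	/*
	 * A second mapping that overlaps either the userspace or the
	 * guest range of an existing one fails with EEXIST.
	 */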

Co-developed-by: Lillian Grassin-Drake <[email protected]>
Signed-off-by: Lillian Grassin-Drake <[email protected]>
Signed-off-by: Nuno Das Neves <[email protected]>
---
 Documentation/virt/mshv/api.rst        |  15 ++
 include/asm-generic/hyperv-tlfs.h      |  15 ++
 include/linux/mshv.h                   |  14 ++
 include/uapi/asm-generic/hyperv-tlfs.h |   9 +
 include/uapi/linux/mshv.h              |  15 ++
 virt/mshv/mshv_main.c                  | 322 ++++++++++++++++++++++++-
 6 files changed, 388 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/mshv/api.rst b/Documentation/virt/mshv/api.rst
index ce651a1738e0..530efc29d354 100644
--- a/Documentation/virt/mshv/api.rst
+++ b/Documentation/virt/mshv/api.rst
@@ -72,3 +72,18 @@ it is open - this ioctl can only be called once per open.
 This ioctl creates a guest partition, returning a file descriptor to use as a
 handle for partition ioctls.
 
+3.3 MSHV_MAP_GUEST_MEMORY and MSHV_UNMAP_GUEST_MEMORY
+-----------------------------------------------------
+:Type: partition ioctl
+:Parameters: struct mshv_user_mem_region
+:Returns: 0 on success
+
+Create a mapping from a region of process memory to a region of physical memory
+in a guest partition.
+
+Mappings must be disjoint in process address space and guest address space.
+
+Note: In the current implementation, this memory is pinned to stop the pages
+being migrated by Linux and subsequently clobbered by the hypervisor, so the
+region is always backed by physical memory.
+
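
To tear the mapping down, MSHV_UNMAP_GUEST_MEMORY takes the same structure;
the size, guest_pfn and userspace_addr fields must exactly match an existing
mapping, and flags is ignored. Continuing the sketch from the commit message
above:

	/* Fields must match the original mapping exactly. */
	ioctl(partition_fd, MSHV_UNMAP_GUEST_MEMORY, &region);
	munmap(buf, region.size);
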
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 2a49503b7396..6e5072e29897 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -149,6 +149,8 @@ struct ms_hyperv_tsc_page {
 #define HVCALL_GET_PARTITION_ID                        0x0046
 #define HVCALL_DEPOSIT_MEMORY                  0x0048
 #define HVCALL_WITHDRAW_MEMORY                 0x0049
+#define HVCALL_MAP_GPA_PAGES                   0x004b
+#define HVCALL_UNMAP_GPA_PAGES                 0x004c
 #define HVCALL_CREATE_VP                       0x004e
 #define HVCALL_GET_VP_REGISTERS                        0x0050
 #define HVCALL_SET_VP_REGISTERS                        0x0051
@@ -827,4 +829,17 @@ struct hv_delete_partition {
        u64 partition_id;
 };
 
+struct hv_map_gpa_pages {
+       u64 target_partition_id;
+       u64 target_gpa_base;
+       u32 map_flags;
+       u64 source_gpa_page_list[];
+};
+
+struct hv_unmap_gpa_pages {
+       u64 target_partition_id;
+       u64 target_gpa_base;
+       u32 unmap_flags;
+};
+
 #endif
diff --git a/include/linux/mshv.h b/include/linux/mshv.h
index fc4f35089b2c..91a742f37440 100644
--- a/include/linux/mshv.h
+++ b/include/linux/mshv.h
@@ -7,13 +7,27 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <uapi/linux/mshv.h>
 
 #define MSHV_MAX_PARTITIONS            128
+#define MSHV_MAX_MEM_REGIONS           64
+
+struct mshv_mem_region {
+       u64 size; /* bytes */
+       u64 guest_pfn;
+       u64 userspace_addr; /* start of the userspace-allocated memory */
+       struct page **pages;
+};
 
 struct mshv_partition {
        u64 id;
        refcount_t ref_count;
+       struct mutex mutex;
+       struct {
+               u32 count;
+               struct mshv_mem_region slots[MSHV_MAX_MEM_REGIONS];
+       } regions;
 };
 
 struct mshv {
diff --git a/include/uapi/asm-generic/hyperv-tlfs.h b/include/uapi/asm-generic/hyperv-tlfs.h
index 7a858226a9c5..e7b09b9f00de 100644
--- a/include/uapi/asm-generic/hyperv-tlfs.h
+++ b/include/uapi/asm-generic/hyperv-tlfs.h
@@ -12,4 +12,13 @@
 #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED          BIT(4)
 #define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED                    BIT(13)
 
+/* HV Map GPA (Guest Physical Address) Flags */
+#define HV_MAP_GPA_PERMISSIONS_NONE     0x0
+#define HV_MAP_GPA_READABLE             0x1
+#define HV_MAP_GPA_WRITABLE             0x2
+#define HV_MAP_GPA_KERNEL_EXECUTABLE    0x4
+#define HV_MAP_GPA_USER_EXECUTABLE      0x8
+#define HV_MAP_GPA_EXECUTABLE           0xC
+#define HV_MAP_GPA_PERMISSIONS_MASK     0xF
+
 #endif
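
For reference, HV_MAP_GPA_EXECUTABLE is the OR of the kernel and user
variants; a typical read/write/execute RAM region might combine the flags
like so (a sketch):

	/* 0x1 | 0x2 | 0xC == 0xF, i.e. HV_MAP_GPA_PERMISSIONS_MASK */
	__u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE |
		      HV_MAP_GPA_EXECUTABLE;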
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 4f8da9a6fde2..47be03ef4e86 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -18,10 +18,25 @@ struct mshv_create_partition {
        struct hv_partition_creation_properties partition_creation_properties;
 };
 
+/*
+ * Mappings can't overlap in GPA space or userspace.
+ * To unmap, these fields must match an existing mapping.
+ */
+struct mshv_user_mem_region {
+       __u64 size;             /* bytes */
+       __u64 guest_pfn;
+       __u64 userspace_addr;   /* start of the userspace-allocated memory */
+       __u32 flags;            /* ignored on unmap */
+};
+
 #define MSHV_IOCTL 0xB8
 
 /* mshv device */
 #define MSHV_REQUEST_VERSION   _IOW(MSHV_IOCTL, 0x00, __u32)
 #define MSHV_CREATE_PARTITION  _IOW(MSHV_IOCTL, 0x01, struct mshv_create_partition)
 
+/* partition device */
+#define MSHV_MAP_GUEST_MEMORY  _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region)
+#define MSHV_UNMAP_GUEST_MEMORY        _IOW(MSHV_IOCTL, 0x03, struct mshv_user_mem_region)
+
 #endif
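
The map ioctl rejects empty or misaligned regions with EINVAL; a
userspace-side sketch of the constraints it enforces (assuming 4 KiB
pages):

	/* Nonzero size; size and address both page-aligned. */
	assert(region.size && !(region.size & 0xFFF));
	assert(!(region.userspace_addr & 0xFFF));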
diff --git a/virt/mshv/mshv_main.c b/virt/mshv/mshv_main.c
index 162a1bb42a4a..ce480598e67f 100644
--- a/virt/mshv/mshv_main.c
+++ b/virt/mshv/mshv_main.c
@@ -60,6 +60,10 @@ static struct miscdevice mshv_dev = {
 
 #define HV_WITHDRAW_BATCH_SIZE (PAGE_SIZE / sizeof(u64))
 #define HV_INIT_PARTITION_DEPOSIT_PAGES 208
+#define HV_MAP_GPA_MASK                (0x0000000FFFFFFFFFULL)
+#define HV_MAP_GPA_BATCH_SIZE  \
+               (PAGE_SIZE / sizeof(struct hv_map_gpa_pages) / sizeof(u64))
+#define PIN_PAGES_BATCH_SIZE   (0x10000000 / PAGE_SIZE)
 
 static int
 hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
@@ -245,16 +249,318 @@ hv_call_delete_partition(u64 partition_id)
        return -hv_status_to_errno(status);
 }
 
+static int
+hv_call_map_gpa_pages(u64 partition_id,
+                     u64 gpa_target,
+                     u64 page_count, u32 flags,
+                     struct page **pages)
+{
+       struct hv_map_gpa_pages *input_page;
+       int status;
+       int i;
+       struct page **p;
+       u32 completed = 0;
+       u64 hypercall_status;
+       unsigned long remaining = page_count;
+       int rep_count;
+       unsigned long irq_flags;
+       int ret = 0;
+
+       while (remaining) {
+               rep_count = min(remaining, HV_MAP_GPA_BATCH_SIZE);
+
+               local_irq_save(irq_flags);
+               input_page = (struct hv_map_gpa_pages *)(*this_cpu_ptr(
+                       hyperv_pcpu_input_arg));
+
+               input_page->target_partition_id = partition_id;
+               input_page->target_gpa_base = gpa_target;
+               input_page->map_flags = flags;
+
+               for (i = 0, p = pages; i < rep_count; i++, p++)
+                       input_page->source_gpa_page_list[i] =
+                               page_to_pfn(*p) & HV_MAP_GPA_MASK;
+               hypercall_status = hv_do_rep_hypercall(
+                       HVCALL_MAP_GPA_PAGES, rep_count, 0, input_page, NULL);
+               local_irq_restore(irq_flags);
+
+               status = hypercall_status & HV_HYPERCALL_RESULT_MASK;
+               completed = (hypercall_status & HV_HYPERCALL_REP_COMP_MASK) >>
+                               HV_HYPERCALL_REP_COMP_OFFSET;
+
+               if (status == HV_STATUS_INSUFFICIENT_MEMORY) {
+                       ret = hv_call_deposit_pages(NUMA_NO_NODE,
+                                                   partition_id, 256);
+                       if (ret)
+                               break;
+               } else if (status != HV_STATUS_SUCCESS) {
+                       pr_err("%s: completed %llu out of %llu, %s\n",
+                              __func__,
+                              page_count - remaining, page_count,
+                              hv_status_to_string(status));
+                       ret = -hv_status_to_errno(status);
+                       break;
+               }
+
+               pages += completed;
+               remaining -= completed;
+               gpa_target += completed;
+       }
+
+       if (ret && (page_count - remaining)) {
+               pr_err("%s: partially succeeded; mapped regions may be in invalid state\n",
+                      __func__);
+               ret = -EBADFD;
+       }
+
+       return ret;
+}
+
+static int
+hv_call_unmap_gpa_pages(u64 partition_id,
+                       u64 gpa_target,
+                       u64 page_count, u32 flags)
+{
+       struct hv_unmap_gpa_pages *input_page;
+       int status;
+       int ret = 0;
+       u32 completed = 0;
+       u64 hypercall_status;
+       unsigned long remaining = page_count;
+       int rep_count;
+       unsigned long irq_flags;
+
+       local_irq_save(irq_flags);
+       input_page = (struct hv_unmap_gpa_pages *)(*this_cpu_ptr(
+               hyperv_pcpu_input_arg));
+
+       input_page->target_partition_id = partition_id;
+       input_page->target_gpa_base = gpa_target;
+       input_page->unmap_flags = flags;
+
+       while (remaining) {
+               rep_count = min(remaining, HV_MAP_GPA_BATCH_SIZE);
+               hypercall_status = hv_do_rep_hypercall(
+                       HVCALL_UNMAP_GPA_PAGES, rep_count, 0, input_page, NULL);
+               status = hypercall_status & HV_HYPERCALL_RESULT_MASK;
+               completed = (hypercall_status & HV_HYPERCALL_REP_COMP_MASK) >>
+                               HV_HYPERCALL_REP_COMP_OFFSET;
+               if (status != HV_STATUS_SUCCESS) {
+                       pr_err("%s: completed %llu out of %llu, %s\n",
+                              __func__,
+                              page_count - remaining, page_count,
+                              hv_status_to_string(status));
+                       ret = -hv_status_to_errno(status);
+                       break;
+               }
+
+               remaining -= completed;
+               gpa_target += completed;
+               input_page->target_gpa_base = gpa_target;
+       }
+       local_irq_restore(irq_flags);
+
+       if (ret && (page_count - remaining)) {
+               pr_err("%s: partially succeeded; mapped regions may be in invalid state\n",
+                      __func__);
+               ret = -EBADFD;
+       }
+
+       return ret;
+}
+
+static long
+mshv_partition_ioctl_map_memory(struct mshv_partition *partition,
+                               struct mshv_user_mem_region __user *user_mem)
+{
+       struct mshv_user_mem_region mem;
+       struct mshv_mem_region *region;
+       int completed;
+       unsigned long remaining, batch_size;
+       int i;
+       struct page **pages;
+       u64 page_count, user_start, user_end, gpfn_start, gpfn_end;
+       u64 region_page_count, region_user_start, region_user_end;
+       u64 region_gpfn_start, region_gpfn_end;
+       long ret = 0;
+
+       /* Check we have enough slots */
+       if (partition->regions.count == MSHV_MAX_MEM_REGIONS) {
+               pr_err("%s: not enough memory region slots\n", __func__);
+               return -ENOSPC;
+       }
+
+       if (copy_from_user(&mem, user_mem, sizeof(mem)))
+               return -EFAULT;
+
+       if (!mem.size ||
+           mem.size & (PAGE_SIZE - 1) ||
+           mem.userspace_addr & (PAGE_SIZE - 1) ||
+           !access_ok(mem.userspace_addr, mem.size))
+               return -EINVAL;
+
+       /* Reject overlapping regions */
+       page_count = mem.size >> PAGE_SHIFT;
+       user_start = mem.userspace_addr;
+       user_end = mem.userspace_addr + mem.size;
+       gpfn_start = mem.guest_pfn;
+       gpfn_end = mem.guest_pfn + page_count;
+       for (i = 0; i < MSHV_MAX_MEM_REGIONS; ++i) {
+               region = &partition->regions.slots[i];
+               if (!region->size)
+                       continue;
+               region_page_count = region->size >> PAGE_SHIFT;
+               region_user_start = region->userspace_addr;
+               region_user_end = region->userspace_addr + region->size;
+               region_gpfn_start = region->guest_pfn;
+               region_gpfn_end = region->guest_pfn + region_page_count;
+
+               if (user_start < region_user_end &&
+                   region_user_start < user_end)
+                       return -EEXIST;
+
+               if (gpfn_start < region_gpfn_end &&
+                   region_gpfn_start < gpfn_end)
+                       return -EEXIST;
+       }
+
+       /* Pin the userspace pages */
+       pages = vzalloc(sizeof(struct page *) * page_count);
+       if (!pages)
+               return -ENOMEM;
+
+       remaining = page_count;
+       while (remaining) {
+               /*
+                * We need to batch this, as pin_user_pages_fast with the
+                * FOLL_LONGTERM flag does a big temporary allocation
+                * of contiguous memory
+                */
+               batch_size = min(remaining, PIN_PAGES_BATCH_SIZE);
+               completed = pin_user_pages_fast(
+                               mem.userspace_addr +
+                                       (page_count - remaining) * PAGE_SIZE,
+                               batch_size,
+                               FOLL_WRITE | FOLL_LONGTERM,
+                               &pages[page_count - remaining]);
+               if (completed < 0) {
+                       pr_err("%s: failed to pin user pages error %i\n",
+                              __func__,
+                              completed);
+                       ret = completed;
+                       goto err_unpin_pages;
+               }
+               remaining -= completed;
+       }
+
+       /* Map the pages to GPA pages */
+       ret = hv_call_map_gpa_pages(partition->id, mem.guest_pfn,
+                                   page_count, mem.flags, pages);
+       if (ret)
+               goto err_unpin_pages;
+
+       /* Install the new region */
+       for (i = 0; i < MSHV_MAX_MEM_REGIONS; ++i) {
+               if (!partition->regions.slots[i].size) {
+                       region = &partition->regions.slots[i];
+                       break;
+               }
+       }
+       region->pages = pages;
+       region->size = mem.size;
+       region->guest_pfn = mem.guest_pfn;
+       region->userspace_addr = mem.userspace_addr;
+
+       partition->regions.count++;
+
+       return 0;
+
+err_unpin_pages:
+       unpin_user_pages(pages, page_count - remaining);
+       vfree(pages);
+
+       return ret;
+}
+
+static long
+mshv_partition_ioctl_unmap_memory(struct mshv_partition *partition,
+                                 struct mshv_user_mem_region __user *user_mem)
+{
+       struct mshv_user_mem_region mem;
+       struct mshv_mem_region *region_ptr;
+       int i;
+       u64 page_count;
+       long ret;
+
+       if (!partition->regions.count)
+               return -EINVAL;
+
+       if (copy_from_user(&mem, user_mem, sizeof(mem)))
+               return -EFAULT;
+
+       /* Find matching region */
+       for (i = 0; i < MSHV_MAX_MEM_REGIONS; ++i) {
+               if (!partition->regions.slots[i].size)
+                       continue;
+               region_ptr = &partition->regions.slots[i];
+               if (region_ptr->userspace_addr == mem.userspace_addr &&
+                   region_ptr->size == mem.size &&
+                   region_ptr->guest_pfn == mem.guest_pfn)
+                       break;
+       }
+
+       if (i == MSHV_MAX_MEM_REGIONS)
+               return -EINVAL;
+
+       page_count = region_ptr->size >> PAGE_SHIFT;
+       ret = hv_call_unmap_gpa_pages(partition->id, region_ptr->guest_pfn,
+                                     page_count, 0);
+       if (ret)
+               return ret;
+
+       unpin_user_pages(region_ptr->pages, page_count);
+       vfree(region_ptr->pages);
+       memset(region_ptr, 0, sizeof(*region_ptr));
+       partition->regions.count--;
+
+       return 0;
+}
+
 static long
 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
-       return -ENOTTY;
+       struct mshv_partition *partition = filp->private_data;
+       long ret;
+
+       if (mutex_lock_killable(&partition->mutex))
+               return -EINTR;
+
+       switch (ioctl) {
+       case MSHV_MAP_GUEST_MEMORY:
+               ret = mshv_partition_ioctl_map_memory(partition,
+                                                       (void __user *)arg);
+               break;
+       case MSHV_UNMAP_GUEST_MEMORY:
+               ret = mshv_partition_ioctl_unmap_memory(partition,
+                                                       (void __user *)arg);
+               break;
+       default:
+               ret = -ENOTTY;
+       }
+
+       mutex_unlock(&partition->mutex);
+       return ret;
 }
 
 static void
 destroy_partition(struct mshv_partition *partition)
 {
-       unsigned long flags;
+       unsigned long flags, page_count;
+       struct mshv_mem_region *region;
        int i;
 
        /* Remove from list of partitions */
@@ -286,6 +592,16 @@ destroy_partition(struct mshv_partition *partition)
 
        hv_call_delete_partition(partition->id);
 
+       /* Remove regions and unpin the pages */
+       for (i = 0; i < MSHV_MAX_MEM_REGIONS; ++i) {
+               region = &partition->regions.slots[i];
+               if (!region->size)
+                       continue;
+               page_count = region->size >> PAGE_SHIFT;
+               unpin_user_pages(region->pages, page_count);
+               vfree(region->pages);
+       }
+
        kfree(partition);
 }
 
@@ -353,6 +669,8 @@ mshv_ioctl_create_partition(void __user *user_arg)
        if (!partition)
                return -ENOMEM;
 
+       mutex_init(&partition->mutex);
+
        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0) {
                ret = fd;
-- 
2.25.1

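For reference, with 4 KiB pages the batch constants in mshv_main.c work out
as below (a sketch of the arithmetic, assuming sizeof(struct
hv_map_gpa_pages) == 24 on a 64-bit build):

	/*
	 * PIN_PAGES_BATCH_SIZE  == 0x10000000 / 4096 == 65536 pages,
	 *                          i.e. 256 MiB pinned per call.
	 * HV_MAP_GPA_BATCH_SIZE == 4096 / 24 / 8 == 21 PFNs per rep
	 *                          hypercall - a conservative bound, as the
	 *                          input page could hold (4096 - 24) / 8
	 *                          == 509 entries after the fixed header.
	 */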