From: Ankur Arora <ankur.a.ar...@oracle.com>

Introduce support for mapping grant references. The sequence of events
to map a grant is:

  rframe = read_shared_entry(guest_grant_table, grant-ref);
  rpfn = get_user_pages_remote(remote_mm, rframe);
  mark_shared_entry(guest_grant_table, grant-ref,
                     GTF_reading | GTF_writing);

To correctly handle grant unmaps for mapped grants, we save the mapping
parameters in maptrack. Also, grant map (and unmap) can be called from
non-sleeping contexts, so we call get_user_pages_remote() in
non-blocking mode and ask the user to retry.

Also note that this code is not compliant with Xen's grant map/unmap
ABI. In particular, we do not support multiple simultaneous mappings of
a grant-reference. Later versions will support that.

Co-developed-by: Joao Martins <joao.m.mart...@oracle.com>
Signed-off-by: Ankur Arora <ankur.a.ar...@oracle.com>
Signed-off-by: Joao Martins <joao.m.mart...@oracle.com>
---
 arch/x86/kvm/xen.c | 396 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 396 insertions(+)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 645cd22ab4e7..3603645086a7 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -9,6 +9,7 @@
 #include "xen.h"
 #include "ioapic.h"
 
+#include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/eventfd.h>
 #include <linux/sched/stat.h>
@@ -29,9 +30,11 @@
 
 /* Grant v1 references per 4K page */
 #define GPP_V1 (PAGE_SIZE / sizeof(struct grant_entry_v1))
+#define shared_entry(gt, ref)  (&((gt)[(ref) / GPP_V1][(ref) % GPP_V1]))
 
 /* Grant mappings per 4K page */
 #define MPP    (PAGE_SIZE / sizeof(struct kvm_grant_map))
+#define maptrack_entry(mt, hdl)        (&((mt)[(hdl) / MPP][(hdl) % MPP]))
 
 struct evtchnfd {
        struct eventfd_ctx *ctx;
@@ -81,6 +84,18 @@ static int kvm_xen_domid_init(struct kvm *kvm, bool any, domid_t domid)
        return 0;
 }
 
+/*
+ * Look up the struct kvm registered for @domid in the domid_to_kvm IDR,
+ * under domid_lock.  Returns NULL when no VM has that domid.
+ *
+ * NOTE(review): no reference is taken on the returned kvm, so callers
+ * must guarantee the VM cannot be destroyed while the pointer is in
+ * use — TODO confirm lifetime rules at the call sites.
+ */
+static struct kvm *kvm_xen_find_vm(domid_t domid)
+{
+       unsigned long flags;
+       struct kvm *vm;
+
+       read_lock_irqsave(&domid_lock, flags);
+       vm = idr_find(&domid_to_kvm, domid);
+       read_unlock_irqrestore(&domid_lock, flags);
+
+       return vm;
+}
+
 int kvm_xen_free_domid(struct kvm *kvm)
 {
        struct kvm_xen *xen = &kvm->arch.xen;
@@ -1153,7 +1168,20 @@ int kvm_xen_gnttab_init(struct kvm *kvm, struct kvm_xen *xen,
        gnttab->frames = addr;
        gnttab->frames[0] = xen->gnttab.initial;
        gnttab->max_nr_frames = max_frames;
+
+       addr = kcalloc(max_mt_frames, sizeof(addr), GFP_KERNEL);
+       if (!addr)
+               goto out;
+
+       /* Needs to be aligned at 16b boundary. */
+       gnttab->handle = addr;
        gnttab->max_mt_frames = max_mt_frames;
+
+       addr = (void *) get_zeroed_page(GFP_KERNEL);
+       if (!addr)
+               goto out;
+       gnttab->handle[0] = addr;
+
        gnttab->nr_mt_frames = 1;
        gnttab->nr_frames = 0;
 
@@ -1162,6 +1190,7 @@ int kvm_xen_gnttab_init(struct kvm *kvm, struct kvm_xen *xen,
        return 0;
 
 out:
+       kfree(xen->gnttab.handle);
        kfree(xen->gnttab.frames);
        kfree(xen->gnttab.frames_addr);
        if (page)
@@ -1170,11 +1199,38 @@ int kvm_xen_gnttab_init(struct kvm *kvm, struct kvm_xen *xen,
        return -ENOMEM;
 }
 
+/*
+ * Drop the page references still held by active maptrack entries when
+ * the domain is torn down.  Walks every entry in the currently
+ * allocated maptrack frames and releases any mapping whose ACTIVE bit
+ * is still set.
+ */
+static void kvm_xen_maptrack_free(struct kvm_xen *xen)
+{
+       u32 max_entries = xen->gnttab.nr_mt_frames * MPP;
+       struct kvm_grant_map *map;
+       int ref, inuse = 0;
+
+       for (ref = 0; ref < max_entries; ref++) {
+               map = maptrack_entry(xen->gnttab.handle, ref);
+
+               /* Clearing ACTIVE claims the entry; skip entries never mapped. */
+               if (test_and_clear_bit(_KVM_GNTMAP_ACTIVE,
+                                      (unsigned long *)&map->flags)) {
+                       /*
+                        * map->gpa holds the host kernel vaddr recorded at
+                        * map time (see shim_hcall_gntmap()), hence
+                        * virt_to_page() rather than a gfn lookup.
+                        */
+                       put_page(virt_to_page(map->gpa));
+                       inuse++;
+               }
+       }
+
+       if (inuse)
+               pr_debug("kvm: dom%u teardown %u mappings\n",
+                        xen->domid, inuse);
+}
+
 void kvm_xen_gnttab_free(struct kvm_xen *xen)
 {
        struct kvm_grant_table *gnttab = &xen->gnttab;
        int i;
 
+       if (xen->domid)
+               kvm_xen_maptrack_free(xen);
+
+       for (i = 0; i < gnttab->nr_mt_frames; i++)
+               free_page((unsigned long)gnttab->handle[i]);
+
        for (i = 0; i < gnttab->nr_frames; i++)
                put_page(virt_to_page(gnttab->frames[i]));
 
@@ -1313,6 +1369,343 @@ void kvm_xen_unregister_lcall(void)
 }
 EXPORT_SYMBOL_GPL(kvm_xen_unregister_lcall);
 
+/*
+ * Number of v1 grant entries addressable in @kvm's grant table.
+ *
+ * Each 4K grant frame holds GPP_V1 entries; an empty table is treated
+ * as having one frame so reference validation works before the guest
+ * grows the table.
+ */
+static inline int gnttab_entries(struct kvm *kvm)
+{
+       struct kvm_grant_table *gnttab = &kvm->arch.xen.gnttab;
+       int n = max_t(unsigned int, gnttab->nr_frames, 1);
+
+       /*
+        * Fix: the previous expression, n * ((n << PAGE_SHIFT) / sizeof
+        * (struct grant_entry_v1)), multiplied by n twice — over-counting
+        * the table quadratically and letting out-of-range op->ref values
+        * pass the bound check in shim_hcall_gntmap().
+        */
+       return n * GPP_V1;
+}
+
+/*
+ * The first two members of a grant entry are updated as a combined pair.
+ * The following union allows that to happen in an endian-neutral fashion.
+ * Taken from Xen.
+ */
+union grant_combo {
+       uint32_t word;          /* flags+domid viewed as one 32-bit value for cmpxchg */
+       struct {
+               uint16_t flags; /* GTF_* flags of the shared entry */
+               domid_t  domid; /* domain the entry grants access to (checked in set_grant_status()) */
+       } shorts;
+};
+
+/*
+ * Mark a grant entry in use by atomically OR-ing GTF_reading (and, for
+ * writable mappings, GTF_writing) into the shared entry @shah with a
+ * cmpxchg on the combined flags+domid word.  Code largely borrowed
+ * from Xen.
+ *
+ * Returns GNTST_okay on success, or GNTST_general_error when the entry
+ * does not permit access to @domid, is read-only while a writable
+ * mapping was requested, or keeps changing under us.
+ */
+static int set_grant_status(domid_t domid, bool readonly,
+                           struct grant_entry_v1 *shah)
+{
+       int rc = GNTST_okay;
+       union grant_combo scombo, prev_scombo, new_scombo;
+       uint16_t mask = GTF_type_mask;
+
+       /*
+        * We bound the number of times we retry CMPXCHG on memory locations
+        * that we share with a guest OS. The reason is that the guest can
+        * modify that location at a higher rate than we can
+        * read-modify-CMPXCHG, so the guest could cause us to livelock. There
+        * are a few cases where it is valid for the guest to race our updates
+        * (e.g., to change the GTF_readonly flag), so we allow a few retries
+        * before failing.
+        */
+       int retries = 0;
+
+       scombo.word = *(u32 *)shah;
+
+       /*
+        * This loop attempts to set the access (reading/writing) flags
+        * in the grant table entry.  It tries a cmpxchg on the field
+        * up to five times, and then fails under the assumption that
+        * the guest is misbehaving.
+        */
+       for (;;) {
+               /* If not already pinned, check the grant domid and type. */
+               if ((((scombo.shorts.flags & mask) != GTF_permit_access) ||
+                   (scombo.shorts.domid != domid))) {
+                       rc = GNTST_general_error;
+                       pr_err("Bad flags (%x) or dom (%d); expected d%d\n",
+                               scombo.shorts.flags, scombo.shorts.domid,
+                               domid);
+                       return rc;
+               }
+
+               new_scombo = scombo;
+               new_scombo.shorts.flags |= GTF_reading;
+
+               if (!readonly) {
+                       new_scombo.shorts.flags |= GTF_writing;
+                       if (unlikely(scombo.shorts.flags & GTF_readonly)) {
+                               rc = GNTST_general_error;
+                               /* Fix: string literal was split across two
+                                * lines by mail wrapping; rejoined. */
+                               pr_err("Attempt to write-pin a r/o grant entry\n");
+                               return rc;
+                       }
+               }
+
+               prev_scombo.word = cmpxchg((u32 *)shah,
+                                          scombo.word, new_scombo.word);
+               if (likely(prev_scombo.word == scombo.word))
+                       break;
+
+               if (retries++ == 4) {
+                       rc = GNTST_general_error;
+                       pr_err("Shared grant entry is unstable\n");
+                       return rc;
+               }
+
+               scombo = prev_scombo;
+       }
+
+       return rc;
+}
+
+#define MT_HANDLE_DOMID_SHIFT  17
+#define MT_HANDLE_DOMID_MASK   0x7fff
+#define MT_HANDLE_GREF_MASK    0x1ffff
+
+/*
+ * Pack (domid, gref) into a 32-bit maptrack handle: gref in the low 17
+ * bits, domid in the bits above.  Values too wide to fit are silently
+ * truncated here; callers detect that by round-tripping through
+ * handle_get_domid()/handle_get_grant() and comparing (see
+ * shim_hcall_gntmap()).
+ */
+static u32 handle_get(domid_t domid, grant_ref_t ref)
+{
+       return (domid << MT_HANDLE_DOMID_SHIFT) | ref;
+}
+
+/* Extract the domid packed into @handle. */
+static u16 handle_get_domid(grant_handle_t handle)
+{
+       return (handle >> MT_HANDLE_DOMID_SHIFT) & MT_HANDLE_DOMID_MASK;
+}
+
+/* Extract the grant reference packed into @handle. */
+static grant_ref_t handle_get_grant(grant_handle_t handle)
+{
+       return handle & MT_HANDLE_GREF_MASK;
+}
+
+/*
+ * Translate guest frame @frame of @rd into a pinned host page without
+ * sleeping.
+ *
+ * On success returns 0 with *@page holding a page reference and *@err
+ * set to GNTST_okay.  On failure returns a negative errno and sets
+ * *@err to the matching GNTST_* code; GNTST_eagain means the caller
+ * should retry (mmap_sem was contended, or I/O is needed to bring the
+ * page in).
+ */
+static int map_grant_nosleep(struct kvm *rd, u64 frame, bool readonly,
+                            struct page **page, u16 *err)
+{
+       unsigned long rhva;
+       int gup_flags, non_blocking;
+       int ret;
+
+       /*
+        * Fix: validate @err and @page before any store through @err;
+        * the old code wrote *err = GNTST_general_error ahead of this
+        * NULL check, dereferencing a possibly-NULL pointer.
+        */
+       if (!err || !page)
+               return -EINVAL;
+
+       *err = GNTST_general_error;
+
+       rhva = gfn_to_hva(rd, frame);
+       if (kvm_is_error_hva(rhva)) {
+               *err = GNTST_bad_page;
+               return -EFAULT;
+       }
+
+       gup_flags = (readonly ? 0 : FOLL_WRITE) | FOLL_NOWAIT;
+
+       /* get_user_pages would clear this if I/O were needed */
+       non_blocking = 1;
+
+       /*
+        * get_user_pages_*() family of functions can sleep if the page needs
+        * to be mapped in. However, our main consumer is the grant map
+        * hypercall and because we run in the same context as the caller
+        * (unlike a real hypercall) sleeping is not an option.
+        *
+        * This is how we avoid it:
+        *  - sleeping on mmap_sem acquisition: we handle that by acquiring the
+        *    read-lock before calling.
+        *    If mmap_sem is contended, return with GNTST_eagain.
+        *  - sync wait for pages to be swapped in: specify FOLL_NOWAIT. If IO
+        *    was needed, would be returned via @non_blocking. Return
+        *    GNTST_eagain if it is necessary and the user would retry.
+        *    Also, in the blocking case, mmap_sem will be released
+        *    asynchronously when the IO completes.
+        */
+       ret = down_read_trylock(&rd->mm->mmap_sem);
+       if (ret == 0) {
+               *err = GNTST_eagain;
+               return -EBUSY;
+       }
+
+       ret = get_user_pages_remote(rd->mm->owner, rd->mm, rhva, 1, gup_flags,
+                                   page, NULL, &non_blocking);
+       if (non_blocking)
+               up_read(&rd->mm->mmap_sem);
+
+       if (ret == 1) {
+               *err = GNTST_okay;
+       } else if (ret == 0) {
+               /* No page and no error: caller must retry. */
+               *err = GNTST_eagain;
+               ret = -EBUSY;
+       } else if (ret < 0) {
+               pr_err("gnttab: failed to get pfn for hva %lx, err %d\n",
+                       rhva, ret);
+               if (ret == -EFAULT) {
+                       *err = GNTST_bad_page;
+               } else if (ret == -EBUSY) {
+                       WARN_ON(non_blocking);
+                       *err = GNTST_eagain;
+               } else {
+                       *err = GNTST_general_error;
+               }
+       }
+
+       return (ret >= 0) ? 0 : ret;
+}
+
+/*
+ * GNTTABOP_map_grant_ref handler: map grant @op->ref offered by remote
+ * domain @op->dom to the calling domain @ld.
+ *
+ * On return, op->status holds the GNTST_* result; on success
+ * op->handle, op->host_addr and op->dev_bus_addr describe the mapping,
+ * and the parameters are recorded in maptrack for the later unmap.
+ * Always returns 0 — errors travel via op->status.
+ */
+static int shim_hcall_gntmap(struct kvm_xen *ld,
+                            struct gnttab_map_grant_ref *op)
+{
+       struct kvm_grant_map map_old, map_new, *map = NULL;
+       bool readonly = op->flags & GNTMAP_readonly;
+       struct grant_entry_v1 *shah;
+       struct page *page = NULL;
+       unsigned long host_kaddr;
+       int err = -ENOSYS;
+       struct kvm *rd;
+       kvm_pfn_t rpfn;
+       u32 frame;
+       u32 idx;
+
+       /* cmpxchg_double below relies on the map entry being exactly 16b. */
+       BUILD_BUG_ON(sizeof(*map) != 16);
+
+       /* Caller must not pre-fill host_addr; we choose the mapping address. */
+       if (unlikely((op->host_addr))) {
+               pr_err("gnttab: bad host_addr %llx in map\n", op->host_addr);
+               op->status = GNTST_bad_virt_addr;
+               return 0;
+       }
+
+       /*
+        * Make sure the guest does not try to smuggle any flags here
+        * (for instance _KVM_GNTMAP_ACTIVE.)
+        * The only allowable flag is GNTMAP_readonly.
+        */
+       if (unlikely(op->flags & ~((u16) GNTMAP_readonly))) {
+               pr_err("gnttab: bad flags %x in map\n", op->flags);
+               op->status = GNTST_bad_gntref;
+               return 0;
+       }
+
+       /* NOTE(review): no reference held on rd — see kvm_xen_find_vm(). */
+       rd = kvm_xen_find_vm(op->dom);
+       if (unlikely(!rd)) {
+               pr_err("gnttab: could not find domain %u\n", op->dom);
+               op->status = GNTST_bad_domain;
+               return 0;
+       }
+
+       if (unlikely(op->ref >= gnttab_entries(rd))) {
+               pr_err("gnttab: bad ref %u\n", op->ref);
+               op->status = GNTST_bad_gntref;
+               return 0;
+       }
+
+       /*
+        * shah is potentially controlled by the user. We cache the frame but
+        * don't care about any changes to domid or flags since those get
+        * validated in set_grant_status() anyway.
+        *
+        * Note that if the guest changes the frame we will end up mapping the
+        * old frame.
+        */
+       shah = shared_entry(rd->arch.xen.gnttab.frames_v1, op->ref);
+       frame = READ_ONCE(shah->frame);
+
+       if (unlikely(shah->domid != ld->domid)) {
+               pr_err("gnttab: bad domain (%u != %u)\n",
+                       shah->domid, ld->domid);
+               op->status = GNTST_bad_gntref;
+               goto out;
+       }
+
+       /* Detect truncation when packing (dom, ref) into the 32-bit handle. */
+       idx = handle_get(op->dom, op->ref);
+       if (handle_get_grant(idx) < op->ref ||
+           handle_get_domid(idx) < op->dom) {
+               pr_err("gnttab: out of maptrack entries (dom %u)\n", ld->domid);
+               op->status = GNTST_general_error;
+               goto out;
+       }
+
+       /*
+        * NOTE(review): op->ref is bounded against gnttab_entries() (the
+        * shared-entry count) but not against the allocated maptrack
+        * frames (nr_mt_frames * MPP); handle[op->ref / MPP] may be an
+        * unallocated (NULL) frame pointer — confirm maptrack growth
+        * keeps pace with the grant table.
+        */
+       map = maptrack_entry(rd->arch.xen.gnttab.handle, op->ref);
+
+       /*
+        * Cache the old map value so we can do our checks on the stable
+        * version. Once the map is done, swap the mapping with the new map.
+        */
+       map_old = *map;
+       if (map_old.flags & KVM_GNTMAP_ACTIVE) {
+               pr_err("gnttab: grant ref %u dom %u in use\n",
+                       op->ref, ld->domid);
+               op->status = GNTST_bad_gntref;
+               goto out;
+       }
+
+       /* Sets op->status itself; -EBUSY means "retry" (GNTST_eagain). */
+       err = map_grant_nosleep(rd, frame, readonly, &page, &op->status);
+       if (err) {
+               if (err != -EBUSY)
+                       op->status = GNTST_bad_gntref;
+               goto out;
+       }
+
+       /* Pin GTF_reading/GTF_writing in the shared entry. */
+       err = set_grant_status(ld->domid, readonly, shah);
+       if (err != GNTST_okay) {
+               pr_err("gnttab: pin failed\n");
+               put_page(page);
+               op->status = err;
+               goto out;
+       }
+
+       rpfn = page_to_pfn(page);
+       host_kaddr = (unsigned long) pfn_to_kaddr(rpfn);
+
+       map_new.domid = op->dom;
+       map_new.ref = op->ref;
+       map_new.flags = op->flags;
+       /* gpa actually records the host kernel vaddr of the mapped page. */
+       map_new.gpa = host_kaddr;
+
+       map_new.flags |= KVM_GNTMAP_ACTIVE;
+
+       /*
+        * Protect against a grant-map that could come in between our check for
+        * KVM_GNTMAP_ACTIVE above and assuming the ownership of the mapping.
+        *
+        * Use cmpxchg_double() so we can update mapping atomically (which
+        * luckily fits in 16b.)
+        */
+       if (cmpxchg_double(&map->gpa, &map->fields,
+                       map_old.gpa, map_old.fields,
+                       map_new.gpa, map_new.fields) == false) {
+               put_page(page);
+               op->status = GNTST_bad_gntref;
+               goto out;
+       }
+
+       op->dev_bus_addr = rpfn << PAGE_SHIFT;
+       op->handle = idx;
+       op->status = GNTST_okay;
+       op->host_addr = host_kaddr;
+       return 0;
+
+out:
+       /* The error code is stored in @status. */
+       return 0;
+}
+
+/*
+ * Dispatch a grant-table hypercall: @op selects the sub-operation,
+ * @p points at an array of @count op structures.  Only
+ * GNTTABOP_map_grant_ref is handled so far; everything else returns
+ * -ENOSYS.
+ */
+static int shim_hcall_gnttab(int op, void *p, int count)
+{
+       int ret = -ENOSYS;
+       int i;
+
+       switch (op) {
+       case GNTTABOP_map_grant_ref: {
+               struct gnttab_map_grant_ref *ref = p;
+
+               /* Per-entry results are reported via ref[i].status. */
+               for (i = 0; i < count; i++)
+                       shim_hcall_gntmap(xen_shim, ref + i);
+               ret = 0;
+               break;
+       }
+       default:
+               pr_info("lcall-gnttab:op default=%d\n", op);
+               break;
+       }
+
+       return ret;
+}
+
 static int shim_hcall_version(int op, struct xen_feature_info *fi)
 {
        if (op != XENVER_get_features || !fi || fi->submap_idx != 0)
@@ -1330,6 +1723,9 @@ static int shim_hypercall(u64 code, u64 a0, u64 a1, u64 a2, u64 a3, u64 a4)
        int ret = -ENOSYS;
 
        switch (code) {
+       case __HYPERVISOR_grant_table_op:
+               ret = shim_hcall_gnttab((int) a0, (void *) a1, (int) a2);
+               break;
        case __HYPERVISOR_xen_version:
                ret = shim_hcall_version((int)a0, (void *)a1);
                break;
-- 
2.11.0

Reply via email to