A VFIO DMABUF can export a subset of a BAR to userspace by fd; add
support for mmap() of this fd.  This provides another route for a
process to map BARs, except one where the process can only map a specific
subset of a BAR represented by the exported DMABUF.

mmap() support enables userspace driver designs that safely delegate
access to BAR sub-ranges to other client processes by sharing a DMABUF
fd, without having to share the (omnipotent) VFIO device fd with them.

The mmap callback installs vm_ops callbacks for .fault and .huge_fault;
they find a PFN by searching the DMABUF's physical ranges.  That is,
DMABUFs with multiple ranges are supported for mmap().

Signed-off-by: Matt Evans <[email protected]>
---
 drivers/vfio/pci/vfio_pci_dmabuf.c | 219 +++++++++++++++++++++++++++++
 1 file changed, 219 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 46ab64fbeb19..bebb496bd0f2 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -85,6 +85,209 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
        kfree(priv);
 }
 
+static int vfio_pci_dma_buf_find_pfn(struct device *dev,
+                                    struct vfio_pci_dma_buf *vpdmabuf,
+                                    struct vm_area_struct *vma,
+                                    unsigned long address,
+                                    unsigned int order,
+                                    unsigned long *out_pfn)
+{
+       /*
+        * Given a VMA (start, end, pgoff) and a fault address,
+        * search phys_vec[] to find the range representing the
+        * address's offset into the VMA (and so a PFN).
+        *
+        * The phys_vec ranges represent contiguous spans of VAs
+        * upwards from the buffer offset 0; the actual PFNs might be
+        * in any order, overlap/alias, etc.  Calculate an offset of
+        * the desired page given VMA start/pgoff and address, then
+        * search upwards from 0 to find which span contains it.
+        *
+        * On success, a valid PFN for a page sized by 'order' is
+        * returned into out_pfn.
+        *
+        * Failure occurs if:
+        * - The page would cross the edge of the VMA
+        * - The page isn't entirely contained within a range
+        * - We find a range, but the final PFN isn't aligned to the
+        *   requested order.
+        *
+        * (Upon failure, the caller is expected to try again with a
+        * smaller order; the tests above will always succeed for
+        * order=0 as the limit case.)
+        *
+        * It's suboptimal if DMABUFs are created with neighbouring
+        * ranges that are physically contiguous, since hugepages
+        * can't straddle range boundaries.  (The construction of the
+        * ranges vector should merge such ranges.)
+        */
+
+       unsigned long rounded_page_addr = address & ~((PAGE_SIZE << order) - 1);
+       unsigned long rounded_page_end = rounded_page_addr + (PAGE_SIZE << order);
+       unsigned long buf_page_offset;
+       unsigned long buf_offset = 0;
+       unsigned int i;
+
+       /* Page of this order would spill over the VMA edge: retry smaller. */
+       if (rounded_page_addr < vma->vm_start || rounded_page_end > vma->vm_end)
+               return -EAGAIN;
+
+       if (unlikely(check_add_overflow(rounded_page_addr - vma->vm_start,
+                                       vma->vm_pgoff << PAGE_SHIFT,
+                                       &buf_page_offset)))
+               return -EFAULT;
+
+       for (i = 0; i < vpdmabuf->nr_ranges; i++) {
+               unsigned long range_len = vpdmabuf->phys_vec[i].len;
+               unsigned long range_start = vpdmabuf->phys_vec[i].paddr;
+
+               if (buf_page_offset >= buf_offset &&
+                   buf_page_offset + (PAGE_SIZE << order) <=
+                   buf_offset + range_len) {
+                       /*
+                        * The faulting page is wholly contained
+                        * within the span represented by the range.
+                        * Validate PFN alignment for the order:
+                        */
+                       unsigned long pfn = (range_start >> PAGE_SHIFT) +
+                               ((buf_page_offset - buf_offset) >> PAGE_SHIFT);
+
+                       if (IS_ALIGNED(pfn, 1UL << order)) {
+                               *out_pfn = pfn;
+                               return 0;
+                       }
+                       /* Retry with smaller order */
+                       return -EAGAIN;
+               }
+               buf_offset += range_len;
+       }
+
+       /*
+        * If we get here, the address fell outside of the span
+        * represented by the (concatenated) ranges.  This can
+        * never happen because vfio_pci_dma_buf_mmap() checks that
+        * the VMA is <= the total size of the ranges.
+        *
+        * But if it does, force SIGBUS for the access, and warn.
+        */
+       WARN_ONCE(1, "No range for addr 0x%lx, order %d: VMA 0x%lx-0x%lx pgoff 0x%lx, %d ranges, size 0x%lx\n",
+                 address, order, vma->vm_start, vma->vm_end, vma->vm_pgoff,
+                 vpdmabuf->nr_ranges, vpdmabuf->size);
+
+       return -EFAULT;
+}
+
+static vm_fault_t vfio_pci_dma_buf_mmap_huge_fault(struct vm_fault *vmf,
+                                                  unsigned int order)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct vfio_pci_dma_buf *priv = vma->vm_private_data;
+       struct vfio_pci_core_device *vdev;
+       /* Initialised so the debug print below never reads garbage. */
+       unsigned long pfn = 0;
+       vm_fault_t ret = VM_FAULT_FALLBACK;
+       int r;
+
+       vdev = READ_ONCE(priv->vdev);
+
+       /*
+        * A fault for an existing mmap might occur after
+        * vfio_pci_dma_buf_cleanup() has revoked and destroyed the
+        * vdev's DMABUFs, and annulled vdev.  After creation, vdev is
+        * only ever written in cleanup.
+        */
+       if (!vdev)
+               return VM_FAULT_SIGBUS;
+
+       r = vfio_pci_dma_buf_find_pfn(&vdev->pdev->dev, priv, vma,
+                                     vmf->address, order, &pfn);
+       if (r == 0) {
+               scoped_guard(rwsem_read, &vdev->memory_lock) {
+                       /*
+                        * Deal with the possibility of a fault racing
+                        * with vfio_pci_dma_buf_move() revoking and
+                        * then unmapping the buffer.  The
+                        * revocation/unmap and status change occurs
+                        * whilst holding memory_lock.
+                        */
+                       if (priv->revoked)
+                               ret = VM_FAULT_SIGBUS;
+                       else
+                               ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn,
+                                                             order);
+               }
+       } else if (r != -EAGAIN) {
+               /* -EAGAIN means "fall back to a smaller order"; the
+                * rest are fatal for this access.
+                */
+               ret = VM_FAULT_SIGBUS;
+       }
+
+       dev_dbg_ratelimited(&vdev->pdev->dev,
+                           "%s(order = %d) PFN 0x%lx, VA 0x%lx, pgoff 0x%lx: 0x%x\n",
+                           __func__, order, pfn, vmf->address, vma->vm_pgoff,
+                           (unsigned int)ret);
+
+       return ret;
+}
+
+/* .fault handler: the order-0 (single base page) case of the huge-fault path. */
+static vm_fault_t vfio_pci_dma_buf_mmap_page_fault(struct vm_fault *vmf)
+{
+       return vfio_pci_dma_buf_mmap_huge_fault(vmf, 0);
+}
+
+/*
+ * vm_ops installed on VMAs created by mmap() of the DMABUF fd.
+ * .huge_fault is only wired up when the architecture supports huge
+ * PFNMAP mappings (CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP).
+ */
+static const struct vm_operations_struct vfio_pci_dma_buf_mmap_ops = {
+       .fault = vfio_pci_dma_buf_mmap_page_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+       .huge_fault = vfio_pci_dma_buf_mmap_huge_fault,
+#endif
+};
+
+static bool vfio_pci_dma_buf_is_mappable(struct dma_buf *dmabuf)
+{
+       struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+       /*
+        * mmap()-time sanity check; alignment was already asserted by
+        * validate_dmabuf_input().
+        *
+        * The revoked state is transient, but a request to map a
+        * revoked buffer is an early hint that something odd is going
+        * on -- for example, users should not be mmap()ing a buffer
+        * that is being moved [by a user-triggered activity] -- so
+        * refuse it.
+        */
+       return !priv->revoked;
+}
+
+/*
+ * Similar to vfio_pci_core_mmap() for a regular VFIO device fd, but
+ * differs by pre-checks performed and ultimately the vm_ops installed.
+ */
+/*
+ * Similar to vfio_pci_core_mmap() for a regular VFIO device fd, but
+ * differs by pre-checks performed and ultimately the vm_ops installed.
+ */
+static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf,
+                                struct vm_area_struct *vma)
+{
+       struct vfio_pci_dma_buf *priv = dmabuf->priv;
+       u64 req_len, req_start, req_end;
+
+       if (!vfio_pci_dma_buf_is_mappable(dmabuf))
+               return -ENODEV;
+       if ((vma->vm_flags & VM_SHARED) == 0)
+               return -EINVAL;
+
+       req_len = vma->vm_end - vma->vm_start;
+       /* Widen before shifting so pgoff cannot truncate on 32-bit. */
+       req_start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+
+       /* The requested window must lie wholly within the DMABUF. */
+       if (check_add_overflow(req_start, req_len, &req_end) ||
+           req_end > priv->size)
+               return -EINVAL;
+
+       vma->vm_private_data = priv;
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+       /*
+        * See comments in vfio_pci_core_mmap() re VM_ALLOW_ANY_UNCACHED.
+        *
+        * FIXME: get mapping attributes from dmabuf?
+        */
+       vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+                    VM_DONTEXPAND | VM_DONTDUMP);
+       vma->vm_ops = &vfio_pci_dma_buf_mmap_ops;
+
+       return 0;
+}
+
 static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
        .pin = vfio_pci_dma_buf_pin,
        .unpin = vfio_pci_dma_buf_unpin,
@@ -92,6 +295,7 @@ static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
        .map_dma_buf = vfio_pci_dma_buf_map,
        .unmap_dma_buf = vfio_pci_dma_buf_unmap,
        .release = vfio_pci_dma_buf_release,
+       .mmap = vfio_pci_dma_buf_mmap,
 };
 
 /*
@@ -335,6 +539,11 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
        struct vfio_pci_dma_buf *tmp;
 
        lockdep_assert_held_write(&vdev->memory_lock);
+       /*
+        * Holding memory_lock ensures a racing
+        * vfio_pci_dma_buf_mmap_*_fault() observes priv->revoked
+        * properly.
+        */
 
        list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
                if (!get_file_active(&priv->dmabuf->file))
@@ -345,6 +554,14 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
                        priv->revoked = revoked;
                        dma_buf_move_notify(priv->dmabuf);
                        dma_resv_unlock(priv->dmabuf->resv);
+
+                       /*
+                        * Unmap any possible userspace mappings for a
+                        * now-revoked DMABUF:
+                        */
+                       if (revoked)
+                               unmap_mapping_range(priv->dmabuf->file->f_mapping,
+                                                   0, priv->size, 1);
                }
                fput(priv->dmabuf->file);
        }
@@ -366,6 +583,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
                priv->revoked = true;
                dma_buf_move_notify(priv->dmabuf);
                dma_resv_unlock(priv->dmabuf->resv);
+               unmap_mapping_range(priv->dmabuf->file->f_mapping,
+                                   0, priv->size, 1);
                vfio_device_put_registration(&vdev->vdev);
                fput(priv->dmabuf->file);
        }
-- 
2.47.3

Reply via email to