On Wed, Jun 10, 2026 at 04:43:21PM +0100, Matt Evans wrote:

Hi Matt,

[...]

> +      *
> +      * With the goal of taking vdev->memory_lock in a world where
> +      * vdev might not still exist:
> +      *
> +      * 1. Take the resv lock on the DMABUF:
> +      *  - If racing cleanup got in first, the buffer is revoked;
> +      *    stop/exit if so.
> +      *  - If we got in first, the buffer is not revoked so vdev is
> +      *    non-NULL, accessible, and cleanup _has not yet put the
> +      *    VFIO device registration_.  So, the device refcount must
> +      *    be >0.
> +      *
> +      * 2. Take vfio_device registration (refcount guaranteed >0
> +      *    hereafter).
> +      *
> +      * 3. Unlock the DMABUF's resv lock:
> +      *  - A racing cleanup can now complete.
> +      *  - But, the device refcount >0, meaning the vfio_device
> +      *    (and vfio_pcie_core device vdev) have not yet been
> +      *    freed.  vdev is accessible, even if the DMABUF has been
> +      *    revoked or cleanup has happened, because
> +      *    vfio_unregister_group_dev() can't complete.
> +      *
> +      * 4. Take the vdev->memory_lock
> +      *  - Either the DMABUF is usable, or has been cleaned up.
> +      *    Whichever, it can no longer change under us.
> +      *  - Test the DMABUF revocation status again: if it was
> +      *    revoked between 1 and 4 return a SIGBUS. Otherwise,
> +      *    return a PFN.
> +      *  - It's not necessary to also take the resv lock, because
> +      *    the status/vdev can't change while memory_lock is held.
> +      *
> +      * 5. Unlock, done.
>        */
> +
> +     dma_resv_lock(priv->dmabuf->resv, NULL);
> +
> +     if (priv->revoked) {
> +             pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF 
> revoked/cleaned up\n",
> +                                  __func__, vmf->address, vma->vm_pgoff);
> +             dma_resv_unlock(priv->dmabuf->resv);
> +             return VM_FAULT_SIGBUS;
> +     }
> +
> +     /* If the buffer isn't revoked, vdev is valid */
>       vdev = priv->vdev;
>  
> +     if (!vfio_device_try_get_registration(&vdev->vdev)) {
> +             /*
> +              * If vdev != NULL (above), the registration should
> +              * already be >0 and so this try_get should never
> +              * fail.
> +              */
> +             dev_warn(&vdev->pdev->dev, "%s: Unexpected registration 
> failure\n",
> +                      __func__);
> +             dma_resv_unlock(priv->dmabuf->resv);
> +             return VM_FAULT_SIGBUS;
> +     }
> +     dma_resv_unlock(priv->dmabuf->resv);
> +


>       scoped_guard(rwsem_read, &vdev->memory_lock) {
> +             /* Revocation status must be re-read, under memory_lock */
>               if (!priv->revoked) {
>                       int pres = vfio_pci_dma_buf_find_pfn(priv, vma,
>                                                            vmf->address,

Wait, I noticed that the is_aligned_for_order() check from mainline was 
removed here. Was that intentional? 

For hugepage faults (order > 0), we must ensure the PFN and address are
properly aligned before calling vfio_pci_vmf_insert_pfn().

In the current upstream code, we have:
  if (is_aligned_for_order(vma, addr, pfn, order))

Should we restore that check here?

> @@ -1766,6 +1827,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct 
> vm_fault *vmf,
>                           __func__, order, pfn, vmf->address,
>                           vma->vm_pgoff, (unsigned int)ret);
>  
> +     vfio_device_put_registration(&vdev->vdev);
>       return ret;
>  }
>  
> @@ -1774,7 +1836,7 @@ static vm_fault_t vfio_pci_mmap_page_fault(struct 
> vm_fault *vmf)
>       return vfio_pci_mmap_huge_fault(vmf, 0);
>  }
>  
> -static const struct vm_operations_struct vfio_pci_mmap_ops = {
> +const struct vm_operations_struct vfio_pci_mmap_ops = {
>       .fault = vfio_pci_mmap_page_fault,

Nit: Instead of making this global, should we add a helper? E.g.:

void vfio_pci_set_vma_ops(struct vm_area_struct *vma)
{
     vma->vm_ops = &vfio_pci_mmap_ops;
}

[...]

> +
> +static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct 
> vm_area_struct *vma)
> +{
> +     struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +     /*
> +      * If we observe that the buffer is revoked now then refuse
> +      * the mmap().  This is a belt-and-braces early failure to
> +      * ease debugging a revoked buffer being used.  Userspace
> +      * might also race an mmap() against an explicit revocation,
> +      * or an action doing a temporary revoke; race scenarios are
> +      * still safe because the fault handler ultimately prevents
> +      * access to a revoked buffer if it isn't caught here.
> +      */
> +     if (READ_ONCE(priv->revoked))
> +             return -ENODEV;
> +     if ((vma->vm_flags & VM_SHARED) == 0)
> +             return -EINVAL;
> +
> +     /*
> +      * dma_buf_mmap_internal() has asserted that the VMA is
> +      * contained within the DMABUF size before calling this.
> +      */
> +
> +     vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +     vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
> +
> +     /* See comments in vfio_pci_core_mmap() re VM_ALLOW_ANY_UNCACHED. */
> +     vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
> +                  VM_DONTEXPAND | VM_DONTDUMP);
> +     vma->vm_private_data = priv;
> +     vma->vm_ops = &vfio_pci_mmap_ops;
> +
> +     return 0;
> +}
>  #endif /* CONFIG_VFIO_PCI_DMABUF */
>  

Thanks,
Praan

Reply via email to