On Thu, 12 Mar 2026 11:46:02 -0700
Matt Evans <[email protected]> wrote:

> This helper, vfio_pci_core_mmap_prep_dmabuf(), creates a single-range
> DMABUF for the purpose of mapping a PCI BAR.  This is used in a future
> commit by VFIO's ordinary mmap() path.
> 
> This function transfers ownership of the VFIO device fd to the
> DMABUF, which fput()s when it's released.
> 
> Refactor the existing vfio_pci_core_feature_dma_buf() to split out
> export code common to the two paths, VFIO_DEVICE_FEATURE_DMA_BUF and
> this new VFIO_BAR mmap().
> 
> Signed-off-by: Matt Evans <[email protected]>
> ---
>  drivers/vfio/pci/vfio_pci_dmabuf.c | 131 +++++++++++++++++++++--------
>  drivers/vfio/pci/vfio_pci_priv.h   |   4 +
>  2 files changed, 102 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c 
> b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index 63140528dbea..76db340ba592 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -82,6 +82,8 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
>               up_write(&priv->vdev->memory_lock);
>               vfio_device_put_registration(&priv->vdev->vdev);
>       }
> +     if (priv->vfile)
> +             fput(priv->vfile);
>       kfree(priv->phys_vec);
>       kfree(priv);
>  }
> @@ -182,6 +184,41 @@ int vfio_pci_dma_buf_find_pfn(struct vfio_pci_dma_buf 
> *vpdmabuf,
>       return -EFAULT;
>  }
>  
> +static int vfio_pci_dmabuf_export(struct vfio_pci_core_device *vdev,
> +                               struct vfio_pci_dma_buf *priv, uint32_t flags,
> +                               size_t size, bool status_ok)
> +{
> +     DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> +
> +     if (!vfio_device_try_get_registration(&vdev->vdev))
> +             return -ENODEV;
> +
> +     exp_info.ops = &vfio_pci_dmabuf_ops;
> +     exp_info.size = size;
> +     exp_info.flags = flags;
> +     exp_info.priv = priv;
> +
> +     priv->dmabuf = dma_buf_export(&exp_info);
> +     if (IS_ERR(priv->dmabuf)) {
> +             vfio_device_put_registration(&vdev->vdev);
> +             return PTR_ERR(priv->dmabuf);
> +     }
> +
> +     kref_init(&priv->kref);
> +     init_completion(&priv->comp);
> +
> +     /* dma_buf_put() now frees priv */
> +     INIT_LIST_HEAD(&priv->dmabufs_elm);
> +     down_write(&vdev->memory_lock);
> +     dma_resv_lock(priv->dmabuf->resv, NULL);
> +     priv->revoked = !status_ok;

Testing __vfio_pci_memory_enabled() without holding memory_lock is
invalid, so evaluating it in the caller, outside of the semaphore, and
passing the result in as a parameter is invalid too.  @status_ok can
already be stale by the time it is used here.

> +     list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
> +     dma_resv_unlock(priv->dmabuf->resv);
> +     up_write(&vdev->memory_lock);
> +
> +     return 0;
> +}
> +
>  /*
>   * This is a temporary "private interconnect" between VFIO DMABUF and 
> iommufd.
>   * It allows the two co-operating drivers to exchange the physical address of
> @@ -300,7 +337,6 @@ int vfio_pci_core_feature_dma_buf(struct 
> vfio_pci_core_device *vdev, u32 flags,
>  {
>       struct vfio_device_feature_dma_buf get_dma_buf = {};
>       struct vfio_region_dma_range *dma_ranges;
> -     DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
>       struct vfio_pci_dma_buf *priv;
>       size_t length;
>       int ret;
> @@ -369,46 +405,20 @@ int vfio_pci_core_feature_dma_buf(struct 
> vfio_pci_core_device *vdev, u32 flags,
>       kfree(dma_ranges);
>       dma_ranges = NULL;
>  
> -     if (!vfio_device_try_get_registration(&vdev->vdev)) {
> -             ret = -ENODEV;
> +     ret = vfio_pci_dmabuf_export(vdev, priv, get_dma_buf.open_flags,
> +                                  priv->size,
> +                                  __vfio_pci_memory_enabled(vdev));
> +     if (ret)
>               goto err_free_phys;
> -     }
> -
> -     exp_info.ops = &vfio_pci_dmabuf_ops;
> -     exp_info.size = priv->size;
> -     exp_info.flags = get_dma_buf.open_flags;
> -     exp_info.priv = priv;
> -
> -     priv->dmabuf = dma_buf_export(&exp_info);
> -     if (IS_ERR(priv->dmabuf)) {
> -             ret = PTR_ERR(priv->dmabuf);
> -             goto err_dev_put;
> -     }
> -
> -     kref_init(&priv->kref);
> -     init_completion(&priv->comp);
> -
> -     /* dma_buf_put() now frees priv */
> -     INIT_LIST_HEAD(&priv->dmabufs_elm);
> -     down_write(&vdev->memory_lock);
> -     dma_resv_lock(priv->dmabuf->resv, NULL);
> -     priv->revoked = !__vfio_pci_memory_enabled(vdev);

Tested under memory_lock.  It was correct previously.

> -     list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
> -     dma_resv_unlock(priv->dmabuf->resv);
> -     up_write(&vdev->memory_lock);
> -
>       /*
>        * dma_buf_fd() consumes the reference, when the file closes the dmabuf
>        * will be released.
>        */
>       ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
> -     if (ret < 0)
> -             goto err_dma_buf;
> -     return ret;
> +     if (ret >= 0)
> +             return ret;
>  
> -err_dma_buf:
>       dma_buf_put(priv->dmabuf);
> -err_dev_put:
>       vfio_device_put_registration(&vdev->vdev);
>  err_free_phys:
>       kfree(priv->phys_vec);
> @@ -419,6 +429,61 @@ int vfio_pci_core_feature_dma_buf(struct 
> vfio_pci_core_device *vdev, u32 flags,
>       return ret;
>  }
>  
> +int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev,
> +                                struct vm_area_struct *vma,
> +                                u64 phys_start,
> +                                u64 pgoff,
> +                                u64 req_len)
> +{
> +     struct vfio_pci_dma_buf *priv;
> +     const unsigned int nr_ranges = 1;
> +     int ret;
> +
> +     priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +     if (!priv)
> +             return -ENOMEM;
> +
> +     priv->phys_vec = kcalloc(nr_ranges, sizeof(*priv->phys_vec),
> +                              GFP_KERNEL);
> +     if (!priv->phys_vec) {
> +             ret = -ENOMEM;
> +             goto err_free_priv;
> +     }
> +
> +     priv->vdev = vdev;
> +     priv->nr_ranges = nr_ranges;
> +     priv->size = req_len;
> +     priv->phys_vec[0].paddr = phys_start + (pgoff << PAGE_SHIFT);
> +     priv->phys_vec[0].len = req_len;
> +
> +     /*
> +      * Creates a DMABUF, adds it to vdev->dmabufs list for
> +      * tracking (meaning cleanup or revocation will zap them), and
> +      * registers with vfio_device:
> +      */
> +     ret = vfio_pci_dmabuf_export(vdev, priv, O_CLOEXEC, priv->size, true);
> +     if (ret)
> +             goto err_free_phys;
> +
> +     /*
> +      * The VMA gets the DMABUF file so that other users can locate
> +      * the DMABUF via a VA.  Ownership of the original VFIO device
> +      * file being mmap()ed transfers to priv, and is put when the
> +      * DMABUF is released.
> +      */
> +     priv->vfile = vma->vm_file;
> +     vma->vm_file = priv->dmabuf->file;

AIUI, this affects what the user sees in /proc/<pid>/maps, right?
Previously a memory range could be clearly associated with a specific
vfio device; now, for vfio-pci devices only, I think the range is
associated with a nondescript dmabuf.  If so, is that an acceptable
user-visible change, and is it debugging friendly (e.g. lsof)?  Thanks,

Alex

> +     vma->vm_private_data = priv;
> +
> +     return 0;
> +
> +err_free_phys:
> +     kfree(priv->phys_vec);
> +err_free_priv:
> +     kfree(priv);
> +     return ret;
> +}
> +
>  void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
>  {
>       struct vfio_pci_dma_buf *priv;
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h 
> b/drivers/vfio/pci/vfio_pci_priv.h
> index 5cc8c85a2153..5fd3a6e00a0e 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -30,6 +30,7 @@ struct vfio_pci_dma_buf {
>       size_t size;
>       struct phys_vec *phys_vec;
>       struct p2pdma_provider *provider;
> +     struct file *vfile;
>       u32 nr_ranges;
>       struct kref kref;
>       struct completion comp;
> @@ -128,6 +129,9 @@ int vfio_pci_dma_buf_find_pfn(struct vfio_pci_dma_buf 
> *vpdmabuf,
>                             unsigned long address,
>                             unsigned int order,
>                             unsigned long *out_pfn);
> +int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev,
> +                                struct vm_area_struct *vma,
> +                                u64 phys_start, u64 pgoff, u64 req_len);
>  
>  #ifdef CONFIG_VFIO_PCI_DMABUF
>  int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 
> flags,

Reply via email to