On Wed, 27 May 2026 03:23:07 -0700 Matt Evans <[email protected]> wrote:
> Convert the VFIO device fd fops->mmap to create a DMABUF representing > the BAR mapping, and make the VMA fault handler look up PFNs from the > corresponding DMABUF. This supports future code mmap()ing BAR > DMABUFs, and iommufd work to support Type1 P2P. > > First, vfio_pci_core_mmap() uses the new > vfio_pci_core_mmap_prep_dmabuf() helper to export a DMABUF > representing a single BAR range. Then, the vfio_pci_mmap_huge_fault() > callback is updated to understand revoked buffers, and uses the new > vfio_pci_dma_buf_find_pfn() helper to determine the PFN for a given > fault address. > > Now that the VFIO DMABUFs can be mmap()ed, vfio_pci_dma_buf_move() > zaps PTEs (used on the revocation and cleanup paths). > > CONFIG_VFIO_PCI_CORE now unconditionally depends on > CONFIG_DMA_SHARED_BUFFER and CONFIG_PCI_P2PDMA_CORE. The > CONFIG_VFIO_PCI_DMABUF feature conditionally includes support for > VFIO_DEVICE_FEATURE_DMA_BUF, depending on the availability of > CONFIG_PCI_P2PDMA. > > Signed-off-by: Matt Evans <[email protected]> > --- > drivers/vfio/pci/Kconfig | 4 +- > drivers/vfio/pci/Makefile | 3 +- > drivers/vfio/pci/vfio_pci_core.c | 79 +++++++++++++++++++----------- > drivers/vfio/pci/vfio_pci_dmabuf.c | 12 +++++ > drivers/vfio/pci/vfio_pci_priv.h | 11 +---- > 5 files changed, 68 insertions(+), 41 deletions(-) > > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig > index 296bf01e185e..9197343a7301 100644 > --- a/drivers/vfio/pci/Kconfig > +++ b/drivers/vfio/pci/Kconfig > @@ -6,6 +6,8 @@ config VFIO_PCI_CORE > tristate > select VFIO_VIRQFD > select IRQ_BYPASS_MANAGER > + select PCI_P2PDMA_CORE > + select DMA_SHARED_BUFFER > > config VFIO_PCI_INTX > def_bool y if !S390 > @@ -56,7 +58,7 @@ config VFIO_PCI_ZDEV_KVM > To enable s390x KVM vfio-pci extensions, say Y. > > config VFIO_PCI_DMABUF > - def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER > + def_bool y if PCI_P2PDMA This largely only breaks consistency, but should VFIO_PCI_CORE become a 'depends on' rather than dropped entirely? > > source "drivers/vfio/pci/mlx5/Kconfig" > > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile > index 6138f1bf241d..881452ea89be 100644 > --- a/drivers/vfio/pci/Makefile > +++ b/drivers/vfio/pci/Makefile > @@ -1,8 +1,7 @@ > # SPDX-License-Identifier: GPL-2.0-only > > -vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o > vfio_pci_config.o > +vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o > vfio_pci_config.o vfio_pci_dmabuf.o > vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o > -vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o > obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o > > vfio-pci-y := vfio_pci.o > diff --git a/drivers/vfio/pci/vfio_pci_core.c > b/drivers/vfio/pci/vfio_pci_core.c > index 041243a84d81..c5f934905ce0 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -1683,18 +1683,6 @@ void vfio_pci_memory_unlock_and_restore(struct > vfio_pci_core_device *vdev, u16 c > up_write(&vdev->memory_lock); > } > > -static unsigned long vma_to_pfn(struct vm_area_struct *vma) > -{ > - struct vfio_pci_core_device *vdev = vma->vm_private_data; > - int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); > - u64 pgoff; > - > - pgoff = vma->vm_pgoff & > - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); > - > - return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; > -} > - > vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, > struct vm_fault *vmf, > unsigned long pfn, > @@ -1722,23 +1710,42 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct > vm_fault *vmf, > unsigned int order) > { > struct vm_area_struct *vma = vmf->vma; > - struct vfio_pci_core_device *vdev = vma->vm_private_data; > - unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); > - unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; > - unsigned long pfn = vma_to_pfn(vma) + pgoff; > - vm_fault_t ret = VM_FAULT_FALLBACK; > - > - if (is_aligned_for_order(vma, addr, pfn, order)) { > - scoped_guard(rwsem_read, &vdev->memory_lock) > - ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); > - } > + struct vfio_pci_dma_buf *priv = vma->vm_private_data; > + struct vfio_pci_core_device *vdev; > + unsigned long pfn = 0; > + vm_fault_t ret = VM_FAULT_SIGBUS; > > - dev_dbg_ratelimited(&vdev->pdev->dev, > - "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", > - __func__, order, > - vma->vm_pgoff >> > - (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), > - pgoff, (unsigned int)ret); > + /* > + * We can rely on the existence of both a DMABUF (priv) and > + * the VFIO device it was exported from (vdev). This fault's > + * VMA was established using vfio_pci_core_mmap_prep_dmabuf() > + * which transfers ownership of the VFIO device fd to the > + * DMABUF, and so the VFIO device is held open because the > + * VMA's vm_file (DMABUF) is open. > + * > + * Since vfio_pci_dma_buf_cleanup() cannot have happened, > + * vdev must be valid; we can take memory_lock. > + */ > + vdev = READ_ONCE(priv->vdev); The above comment argues that vdev is stable, so why do we need to access it with READ_ONCE()? > + > + scoped_guard(rwsem_read, &vdev->memory_lock) { > + if (!priv->revoked) { > + int pres = vfio_pci_dma_buf_find_pfn(priv, vma, > + vmf->address, > + order, &pfn); > + > + if (pres == 0) > + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, > + pfn, order); > + else if (pres == -EAGAIN) > + ret = VM_FAULT_FALLBACK; > + } > + > + dev_dbg_ratelimited(&vdev->pdev->dev, > + "%s(order = %d) PFN 0x%lx, VA 0x%lx, pgoff > 0x%lx: 0x%x\n", > + __func__, order, pfn, vmf->address, > + vma->vm_pgoff, (unsigned int)ret); Looks like this should still be outside the scope of the memory_lock. Thanks, Alex > + } > > return ret; > } > @@ -1763,6 +1770,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, > struct vm_area_struct *vma > unsigned int index; > u64 phys_len, req_len, pgoff, req_start; > void __iomem *bar_io; > + int ret; > > index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); > > @@ -1802,7 +1810,20 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, > struct vm_area_struct *vma > if (IS_ERR(bar_io)) > return PTR_ERR(bar_io); > > - vma->vm_private_data = vdev; > + /* > + * Create a DMABUF with a single range corresponding to this > + * mapping, and wire it into vma->vm_private_data. The VMA's > + * vm_file becomes that of the DMABUF, and the DMABUF takes > + * ownership of the VFIO device file (put upon DMABUF > + * release). This maintains the behaviour of a live VMA > + * mapping holding the VFIO device file open. > + */ > + ret = vfio_pci_core_mmap_prep_dmabuf(vdev, vma, > + pci_resource_start(pdev, index), > + req_len, index); > + if (ret) > + return ret; > + > vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); > vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); > > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c > b/drivers/vfio/pci/vfio_pci_dmabuf.c > index 782408c08a5e..f7797f58d44b 100644 > --- a/drivers/vfio/pci/vfio_pci_dmabuf.c > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c > @@ -9,6 +9,7 @@ > > MODULE_IMPORT_NS("DMA_BUF"); > > +#ifdef CONFIG_VFIO_PCI_DMABUF > static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, > struct dma_buf_attachment *attachment) > { > @@ -25,6 +26,7 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, > > return 0; > } > +#endif /* CONFIG_VFIO_PCI_DMABUF */ > > static void vfio_pci_dma_buf_done(struct kref *kref) > { > @@ -89,7 +91,9 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) > } > > static const struct dma_buf_ops vfio_pci_dmabuf_ops = { > +#ifdef CONFIG_VFIO_PCI_DMABUF > .attach = vfio_pci_dma_buf_attach, > +#endif > .map_dma_buf = vfio_pci_dma_buf_map, > .unmap_dma_buf = vfio_pci_dma_buf_unmap, > .release = vfio_pci_dma_buf_release, > @@ -263,6 +267,7 @@ static int vfio_pci_dmabuf_export(struct > vfio_pci_core_device *vdev, > return 0; > } > > +#ifdef CONFIG_VFIO_PCI_DMABUF > /* > * This is a temporary "private interconnect" between VFIO DMABUF and > iommufd. > * It allows the two co-operating drivers to exchange the physical address of > @@ -461,6 +466,7 @@ int vfio_pci_core_feature_dma_buf(struct > vfio_pci_core_device *vdev, u32 flags, > kfree(dma_ranges); > return ret; > } > +#endif /* CONFIG_VFIO_PCI_DMABUF */ > > int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev, > struct vm_area_struct *vma, > @@ -535,6 +541,10 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device > *vdev, bool revoked) > struct vfio_pci_dma_buf *tmp; > > lockdep_assert_held_write(&vdev->memory_lock); > + /* > + * Holding memory_lock ensures a racing VMA fault observes > + * priv->revoked properly. > + */ > > list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { > if (!get_file_active(&priv->dmabuf->file)) > @@ -552,6 +562,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device > *vdev, bool revoked) > if (revoked) { > kref_put(&priv->kref, vfio_pci_dma_buf_done); > wait_for_completion(&priv->comp); > + > unmap_mapping_range(priv->dmabuf->file->f_mapping, > + 0, priv->size, 1); > /* > * Re-arm the registered kref reference and the > * completion so the post-revoke state matches > the > diff --git a/drivers/vfio/pci/vfio_pci_priv.h > b/drivers/vfio/pci/vfio_pci_priv.h > index 06dc0fd3e230..d38e1b98b2e9 100644 > --- a/drivers/vfio/pci/vfio_pci_priv.h > +++ b/drivers/vfio/pci/vfio_pci_priv.h > @@ -138,13 +138,13 @@ int vfio_pci_core_mmap_prep_dmabuf(struct > vfio_pci_core_device *vdev, > struct vm_area_struct *vma, > u64 phys_start, u64 req_len, > unsigned int res_index); > +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); > > #ifdef CONFIG_VFIO_PCI_DMABUF > int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 > flags, > struct vfio_device_feature_dma_buf __user > *arg, > size_t argsz); > -void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); > -void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); > #else > static inline int > vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, > @@ -153,13 +153,6 @@ vfio_pci_core_feature_dma_buf(struct > vfio_pci_core_device *vdev, u32 flags, > { > return -ENOTTY; > } > -static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device > *vdev) > -{ > -} > -static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, > - bool revoked) > -{ > -} > #endif > > #endif
