On Mon, 4 Aug 2025 16:00:45 +0300 Leon Romanovsky <l...@kernel.org> wrote:
> From: Leon Romanovsky <leo...@nvidia.com> > > Add support for exporting PCI device MMIO regions through dma-buf, > enabling safe sharing of non-struct page memory with controlled > lifetime management. This allows RDMA and other subsystems to import > dma-buf FDs and build them into memory regions for PCI P2P operations. > > The implementation provides a revocable attachment mechanism using > dma-buf move operations. MMIO regions are normally pinned as BARs > don't change physical addresses, but access is revoked when the VFIO > device is closed or a PCI reset is issued. This ensures kernel > self-defense against potentially hostile userspace. > > Signed-off-by: Jason Gunthorpe <j...@nvidia.com> > Signed-off-by: Vivek Kasireddy <vivek.kasire...@intel.com> > Signed-off-by: Leon Romanovsky <leo...@nvidia.com> > --- > drivers/vfio/pci/Kconfig | 20 ++ > drivers/vfio/pci/Makefile | 2 + > drivers/vfio/pci/vfio_pci_config.c | 22 +- > drivers/vfio/pci/vfio_pci_core.c | 25 +- > drivers/vfio/pci/vfio_pci_dmabuf.c | 390 +++++++++++++++++++++++++++++ > drivers/vfio/pci/vfio_pci_priv.h | 23 ++ > include/linux/dma-buf.h | 1 + > include/linux/vfio_pci_core.h | 3 + > include/uapi/linux/vfio.h | 25 ++ > 9 files changed, 506 insertions(+), 5 deletions(-) > create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c > > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig > index 2b0172f546652..55ae888bf26ae 100644 > --- a/drivers/vfio/pci/Kconfig > +++ b/drivers/vfio/pci/Kconfig > @@ -55,6 +55,26 @@ config VFIO_PCI_ZDEV_KVM > > To enable s390x KVM vfio-pci extensions, say Y. > > +config VFIO_PCI_DMABUF > + bool "VFIO PCI extensions for DMA-BUF" > + depends on VFIO_PCI_CORE > + depends on PCI_P2PDMA && DMA_SHARED_BUFFER > + default y > + help > + Enable support for VFIO PCI extensions that allow exporting > + device MMIO regions as DMA-BUFs for peer devices to access via > + peer-to-peer (P2P) DMA. 
> + > + This feature enables a VFIO-managed PCI device to export a portion > + of its MMIO BAR as a DMA-BUF file descriptor, which can be passed > + to other userspace drivers or kernel subsystems capable of > + initiating DMA to that region. > + > + Say Y here if you want to enable VFIO DMABUF-based MMIO export > + support for peer-to-peer DMA use cases. > + > + If unsure, say N. > + > source "drivers/vfio/pci/mlx5/Kconfig" > > source "drivers/vfio/pci/hisilicon/Kconfig" > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile > index cf00c0a7e55c8..f9155e9c5f630 100644 > --- a/drivers/vfio/pci/Makefile > +++ b/drivers/vfio/pci/Makefile > @@ -2,7 +2,9 @@ > > vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o > vfio_pci_config.o > vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o > + > obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o > +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o > > vfio-pci-y := vfio_pci.o > vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o > diff --git a/drivers/vfio/pci/vfio_pci_config.c > b/drivers/vfio/pci/vfio_pci_config.c > index 8f02f236b5b4b..7e23387a43b4d 100644 > --- a/drivers/vfio/pci/vfio_pci_config.c > +++ b/drivers/vfio/pci/vfio_pci_config.c > @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct > vfio_pci_core_device *vdev, int pos, > virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); > new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); > > - if (!new_mem) > + if (!new_mem) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > - else > + vfio_pci_dma_buf_move(vdev, true); > + } else { > down_write(&vdev->memory_lock); > + } > > /* > * If the user is writing mem/io enable (new_mem/io) and we > @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct > vfio_pci_core_device *vdev, int pos, > *virt_cmd &= cpu_to_le16(~mask); > *virt_cmd |= cpu_to_le16(new_cmd & mask); > > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > 
up_write(&vdev->memory_lock); > } > > @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct > perm_bits *perm) > static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, > pci_power_t state) > { > - if (state >= PCI_D3hot) > + if (state >= PCI_D3hot) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > - else > + vfio_pci_dma_buf_move(vdev, true); > + } else { > down_write(&vdev->memory_lock); > + } > > vfio_pci_set_power_state(vdev, state); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > } > > @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct > vfio_pci_core_device *vdev, int pos, > > if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > + vfio_pci_dma_buf_move(vdev, true); > pci_try_reset_function(vdev->pdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, true); @revoked true -> true seems wrong: the dma-buf was already revoked before the reset, so this second call should pass false to restore access once memory is enabled again, as the other revoke/restore pairs in this patch do. > up_write(&vdev->memory_lock); > } > } > @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct > vfio_pci_core_device *vdev, int pos, > > if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > + vfio_pci_dma_buf_move(vdev, true); > pci_try_reset_function(vdev->pdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, true); Same here: the post-reset call should pass false, not true.
> up_write(&vdev->memory_lock); > } > } > diff --git a/drivers/vfio/pci/vfio_pci_core.c > b/drivers/vfio/pci/vfio_pci_core.c > index b1863d84b11aa..8e840ac413e9b 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -28,7 +28,9 @@ > #include <linux/nospec.h> > #include <linux/sched/mm.h> > #include <linux/iommufd.h> > +#ifdef CONFIG_VFIO_PCI_DMABUF > #include <linux/pci-p2pdma.h> > +#endif > #if IS_ENABLED(CONFIG_EEH) > #include <asm/eeh.h> > #endif > @@ -287,6 +289,8 @@ static int vfio_pci_runtime_pm_entry(struct > vfio_pci_core_device *vdev, > * semaphore. > */ > vfio_pci_zap_and_down_write_memory_lock(vdev); > + vfio_pci_dma_buf_move(vdev, true); > + > if (vdev->pm_runtime_engaged) { > up_write(&vdev->memory_lock); > return -EINVAL; > @@ -370,6 +374,8 @@ static void vfio_pci_runtime_pm_exit(struct > vfio_pci_core_device *vdev) > */ > down_write(&vdev->memory_lock); > __vfio_pci_runtime_pm_exit(vdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > } > > @@ -690,6 +696,8 @@ void vfio_pci_core_close_device(struct vfio_device > *core_vdev) > #endif > vfio_pci_core_disable(vdev); > > + vfio_pci_dma_buf_cleanup(vdev); > + > mutex_lock(&vdev->igate); > if (vdev->err_trigger) { > eventfd_ctx_put(vdev->err_trigger); > @@ -1222,7 +1230,10 @@ static int vfio_pci_ioctl_reset(struct > vfio_pci_core_device *vdev, > */ > vfio_pci_set_power_state(vdev, PCI_D0); > > + vfio_pci_dma_buf_move(vdev, true); > ret = pci_try_reset_function(vdev->pdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > > return ret; > @@ -1511,6 +1522,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device > *device, u32 flags, > return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); > case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: > return vfio_pci_core_feature_token(vdev, flags, arg, argsz); > + case VFIO_DEVICE_FEATURE_DMA_BUF: > + return 
vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); > default: > return -ENOTTY; > } > @@ -2085,9 +2098,13 @@ int vfio_pci_core_init_dev(struct vfio_device > *core_vdev) > INIT_LIST_HEAD(&vdev->dummy_resources_list); > INIT_LIST_HEAD(&vdev->ioeventfds_list); > INIT_LIST_HEAD(&vdev->sriov_pfs_item); > +#ifdef CONFIG_VFIO_PCI_DMABUF > vdev->provider = pci_p2pdma_enable(vdev->pdev); > if (IS_ERR(vdev->provider)) > return PTR_ERR(vdev->provider); > + > + INIT_LIST_HEAD(&vdev->dmabufs); > +#endif > init_rwsem(&vdev->memory_lock); > xa_init(&vdev->ctx); > > @@ -2470,11 +2487,17 @@ static int vfio_pci_dev_set_hot_reset(struct > vfio_device_set *dev_set, > * cause the PCI config space reset without restoring the original > * state (saved locally in 'vdev->pm_save'). > */ > - list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) > + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) { > + vfio_pci_dma_buf_move(vdev, true); > vfio_pci_set_power_state(vdev, PCI_D0); > + } The revoke should have happened at the time the BARs were zapped. Thanks, Alex > > ret = pci_reset_bus(pdev); > > + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > + > vdev = list_last_entry(&dev_set->device_list, > struct vfio_pci_core_device, vdev.dev_set_list); >