On Mon,  4 Aug 2025 16:00:45 +0300
Leon Romanovsky <l...@kernel.org> wrote:

> From: Leon Romanovsky <leo...@nvidia.com>
> 
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
> 
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned, since BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.
> 
> Signed-off-by: Jason Gunthorpe <j...@nvidia.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasire...@intel.com>
> Signed-off-by: Leon Romanovsky <leo...@nvidia.com>
> ---
>  drivers/vfio/pci/Kconfig           |  20 ++
>  drivers/vfio/pci/Makefile          |   2 +
>  drivers/vfio/pci/vfio_pci_config.c |  22 +-
>  drivers/vfio/pci/vfio_pci_core.c   |  25 +-
>  drivers/vfio/pci/vfio_pci_dmabuf.c | 390 +++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci_priv.h   |  23 ++
>  include/linux/dma-buf.h            |   1 +
>  include/linux/vfio_pci_core.h      |   3 +
>  include/uapi/linux/vfio.h          |  25 ++
>  9 files changed, 506 insertions(+), 5 deletions(-)
>  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
> 
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f546652..55ae888bf26ae 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -55,6 +55,26 @@ config VFIO_PCI_ZDEV_KVM
>  
>         To enable s390x KVM vfio-pci extensions, say Y.
>  
> +config VFIO_PCI_DMABUF
> +     bool "VFIO PCI extensions for DMA-BUF"
> +     depends on VFIO_PCI_CORE
> +     depends on PCI_P2PDMA && DMA_SHARED_BUFFER
> +     default y
> +     help
> +       Enable support for VFIO PCI extensions that allow exporting
> +       device MMIO regions as DMA-BUFs for peer devices to access via
> +       peer-to-peer (P2P) DMA.
> +
> +       This feature enables a VFIO-managed PCI device to export a portion
> +       of its MMIO BAR as a DMA-BUF file descriptor, which can be passed
> +       to other userspace drivers or kernel subsystems capable of
> +       initiating DMA to that region.
> +
> +       Say Y here if you want to enable VFIO DMABUF-based MMIO export
> +       support for peer-to-peer DMA use cases.
> +
> +       If unsure, say N.
> +
>  source "drivers/vfio/pci/mlx5/Kconfig"
>  
>  source "drivers/vfio/pci/hisilicon/Kconfig"
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c8..f9155e9c5f630 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -2,7 +2,9 @@
>  
>  vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o 
> vfio_pci_config.o
>  vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
> +
>  obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
> +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
>  
>  vfio-pci-y := vfio_pci.o
>  vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> diff --git a/drivers/vfio/pci/vfio_pci_config.c 
> b/drivers/vfio/pci/vfio_pci_config.c
> index 8f02f236b5b4b..7e23387a43b4d 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct 
> vfio_pci_core_device *vdev, int pos,
>               virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
>               new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
>  
> -             if (!new_mem)
> +             if (!new_mem) {
>                       vfio_pci_zap_and_down_write_memory_lock(vdev);
> -             else
> +                     vfio_pci_dma_buf_move(vdev, true);
> +             } else {
>                       down_write(&vdev->memory_lock);
> +             }
>  
>               /*
>                * If the user is writing mem/io enable (new_mem/io) and we
> @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct 
> vfio_pci_core_device *vdev, int pos,
>               *virt_cmd &= cpu_to_le16(~mask);
>               *virt_cmd |= cpu_to_le16(new_cmd & mask);
>  
> +             if (__vfio_pci_memory_enabled(vdev))
> +                     vfio_pci_dma_buf_move(vdev, false);
>               up_write(&vdev->memory_lock);
>       }
>  
> @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct 
> perm_bits *perm)
>  static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
>                                         pci_power_t state)
>  {
> -     if (state >= PCI_D3hot)
> +     if (state >= PCI_D3hot) {
>               vfio_pci_zap_and_down_write_memory_lock(vdev);
> -     else
> +             vfio_pci_dma_buf_move(vdev, true);
> +     } else {
>               down_write(&vdev->memory_lock);
> +     }
>  
>       vfio_pci_set_power_state(vdev, state);
> +     if (__vfio_pci_memory_enabled(vdev))
> +             vfio_pci_dma_buf_move(vdev, false);
>       up_write(&vdev->memory_lock);
>  }
>  
> @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct 
> vfio_pci_core_device *vdev, int pos,
>  
>               if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
>                       vfio_pci_zap_and_down_write_memory_lock(vdev);
> +                     vfio_pci_dma_buf_move(vdev, true);
>                       pci_try_reset_function(vdev->pdev);
> +                     if (__vfio_pci_memory_enabled(vdev))
> +                             vfio_pci_dma_buf_move(vdev, true);

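@revoked true -> true seems wrong; after the reset this should presumably be
vfio_pci_dma_buf_move(vdev, false), as in the other reset paths.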

>                       up_write(&vdev->memory_lock);
>               }
>       }
> @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct 
> vfio_pci_core_device *vdev, int pos,
>  
>               if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
>                       vfio_pci_zap_and_down_write_memory_lock(vdev);
> +                     vfio_pci_dma_buf_move(vdev, true);
>                       pci_try_reset_function(vdev->pdev);
> +                     if (__vfio_pci_memory_enabled(vdev))
> +                             vfio_pci_dma_buf_move(vdev, true);

Same here: @revoked true -> true seems wrong.

>                       up_write(&vdev->memory_lock);
>               }
>       }
> diff --git a/drivers/vfio/pci/vfio_pci_core.c 
> b/drivers/vfio/pci/vfio_pci_core.c
> index b1863d84b11aa..8e840ac413e9b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -28,7 +28,9 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>  #include <linux/pci-p2pdma.h>
> +#endif
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -287,6 +289,8 @@ static int vfio_pci_runtime_pm_entry(struct 
> vfio_pci_core_device *vdev,
>        * semaphore.
>        */
>       vfio_pci_zap_and_down_write_memory_lock(vdev);
> +     vfio_pci_dma_buf_move(vdev, true);
> +
>       if (vdev->pm_runtime_engaged) {
>               up_write(&vdev->memory_lock);
>               return -EINVAL;
> @@ -370,6 +374,8 @@ static void vfio_pci_runtime_pm_exit(struct 
> vfio_pci_core_device *vdev)
>        */
>       down_write(&vdev->memory_lock);
>       __vfio_pci_runtime_pm_exit(vdev);
> +     if (__vfio_pci_memory_enabled(vdev))
> +             vfio_pci_dma_buf_move(vdev, false);
>       up_write(&vdev->memory_lock);
>  }
>  
> @@ -690,6 +696,8 @@ void vfio_pci_core_close_device(struct vfio_device 
> *core_vdev)
>  #endif
>       vfio_pci_core_disable(vdev);
>  
> +     vfio_pci_dma_buf_cleanup(vdev);
> +
>       mutex_lock(&vdev->igate);
>       if (vdev->err_trigger) {
>               eventfd_ctx_put(vdev->err_trigger);
> @@ -1222,7 +1230,10 @@ static int vfio_pci_ioctl_reset(struct 
> vfio_pci_core_device *vdev,
>        */
>       vfio_pci_set_power_state(vdev, PCI_D0);
>  
> +     vfio_pci_dma_buf_move(vdev, true);
>       ret = pci_try_reset_function(vdev->pdev);
> +     if (__vfio_pci_memory_enabled(vdev))
> +             vfio_pci_dma_buf_move(vdev, false);
>       up_write(&vdev->memory_lock);
>  
>       return ret;
> @@ -1511,6 +1522,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device 
> *device, u32 flags,
>               return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
>       case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>               return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> +     case VFIO_DEVICE_FEATURE_DMA_BUF:
> +             return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
>       default:
>               return -ENOTTY;
>       }
> @@ -2085,9 +2098,13 @@ int vfio_pci_core_init_dev(struct vfio_device 
> *core_vdev)
>       INIT_LIST_HEAD(&vdev->dummy_resources_list);
>       INIT_LIST_HEAD(&vdev->ioeventfds_list);
>       INIT_LIST_HEAD(&vdev->sriov_pfs_item);
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>       vdev->provider = pci_p2pdma_enable(vdev->pdev);
>       if (IS_ERR(vdev->provider))
>               return PTR_ERR(vdev->provider);
> +
> +     INIT_LIST_HEAD(&vdev->dmabufs);
> +#endif
>       init_rwsem(&vdev->memory_lock);
>       xa_init(&vdev->ctx);
>  
> @@ -2470,11 +2487,17 @@ static int vfio_pci_dev_set_hot_reset(struct 
> vfio_device_set *dev_set,
>        * cause the PCI config space reset without restoring the original
>        * state (saved locally in 'vdev->pm_save').
>        */
> -     list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +     list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
> +             vfio_pci_dma_buf_move(vdev, true);
>               vfio_pci_set_power_state(vdev, PCI_D0);
> +     }

The revoke should have happened earlier, at the time the BARs were zapped,
rather than in this loop.
Thanks,

Alex

>  
>       ret = pci_reset_bus(pdev);
>  
> +     list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +             if (__vfio_pci_memory_enabled(vdev))
> +                     vfio_pci_dma_buf_move(vdev, false);
> +
>       vdev = list_last_entry(&dev_set->device_list,
>                              struct vfio_pci_core_device, vdev.dev_set_list);
>  

Reply via email to