On Mon, 13 Oct 2025 18:26:11 +0300
Leon Romanovsky <[email protected]> wrote:
> diff --git a/drivers/vfio/pci/vfio_pci_core.c 
> b/drivers/vfio/pci/vfio_pci_core.c
> index fe247d0e2831..56b1320238a9 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device 
> *device, u32 flags,
>               return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
>       case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>               return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> +     case VFIO_DEVICE_FEATURE_DMA_BUF:
> +             if (device->ops->ioctl != vfio_pci_core_ioctl)
> +                     /*
> +                      * Devices that overwrite general .ioctl() callback
> +                      * usually do it to implement their own
> +                      * VFIO_DEVICE_GET_REGION_INFO handlerm and they present

Typo, "handlerm"

> +                      * different BAR information from the real PCI.
> +                      *
> +                      * DMABUF relies on real PCI information.
> +                      */
> +                     return -EOPNOTSUPP;
> +
> +             return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
>       default:
>               return -ENOTTY;
>       }
...
> @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct 
> vfio_device_set *dev_set,
>                       break;
>               }
>  
> +             vfio_pci_dma_buf_move(vdev, true);
>               vfio_pci_zap_bars(vdev);
>       }
>  
> @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct 
> vfio_device_set *dev_set,
>  
>       ret = pci_reset_bus(pdev);
>  
> +     list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +             if (__vfio_pci_memory_enabled(vdev))
> +                     vfio_pci_dma_buf_move(vdev, false);
> +
>       vdev = list_last_entry(&dev_set->device_list,
>                              struct vfio_pci_core_device, vdev.dev_set_list);
>  

This needs to be placed in the existing undo loop with the up_write(),
otherwise it can be missed in the error case.

> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c 
> b/drivers/vfio/pci/vfio_pci_dmabuf.c
> new file mode 100644
> index 000000000000..eaba010777f3
> --- /dev/null
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
> +                               struct dma_iova_state *state)
> +{
> +     struct phys_vec *phys_vec = priv->phys_vec;
> +     unsigned int nents = 0;
> +     u32 i;
> +
> +     if (!state || !dma_use_iova(state))
> +             for (i = 0; i < priv->nr_ranges; i++)
> +                     nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
> +     else
> +             /*
> +              * In IOVA case, there is only one SG entry which spans
> +              * for whole IOVA address space, but we need to make sure
> +              * that it fits sg->length, maybe we need more.
> +              */
> +             nents = DIV_ROUND_UP(priv->size, UINT_MAX);

I think we're arguably running afoul of the coding style standard here —
this is not a single simple statement and should use braces.

> +
> +     return nents;
> +}
> +
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +                  enum dma_data_direction dir)
> +{
> +     struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +     struct dma_iova_state *state = attachment->priv;
> +     struct phys_vec *phys_vec = priv->phys_vec;
> +     unsigned long attrs = DMA_ATTR_MMIO;
> +     unsigned int nents, mapped_len = 0;
> +     struct scatterlist *sgl;
> +     struct sg_table *sgt;
> +     dma_addr_t addr;
> +     int ret;
> +     u32 i;
> +
> +     dma_resv_assert_held(priv->dmabuf->resv);
> +
> +     if (priv->revoked)
> +             return ERR_PTR(-ENODEV);
> +
> +     sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +     if (!sgt)
> +             return ERR_PTR(-ENOMEM);
> +
> +     nents = calc_sg_nents(priv, state);
> +     ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
> +     if (ret)
> +             goto err_kfree_sgt;
> +
> +     sgl = sgt->sgl;
> +
> +     for (i = 0; i < priv->nr_ranges; i++) {
> +             if (!state) {
> +                     addr = pci_p2pdma_bus_addr_map(priv->provider,
> +                                                    phys_vec[i].paddr);
> +             } else if (dma_use_iova(state)) {
> +                     ret = dma_iova_link(attachment->dev, state,
> +                                         phys_vec[i].paddr, 0,
> +                                         phys_vec[i].len, dir, attrs);
> +                     if (ret)
> +                             goto err_unmap_dma;
> +
> +                     mapped_len += phys_vec[i].len;
> +             } else {
> +                     addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
> +                                         phys_vec[i].len, dir, attrs);
> +                     ret = dma_mapping_error(attachment->dev, addr);
> +                     if (ret)
> +                             goto err_unmap_dma;
> +             }
> +
> +             if (!state || !dma_use_iova(state))
> +                     sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
> +     }
> +
> +     if (state && dma_use_iova(state)) {
> +             WARN_ON_ONCE(mapped_len != priv->size);
> +             ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
> +             if (ret)
> +                     goto err_unmap_dma;
> +             sgl = fill_sg_entry(sgl, mapped_len, state->addr);
> +     }
> +
> +     /*
> +      * SGL must be NULL to indicate that SGL is the last one
> +      * and we allocated correct number of entries in sg_alloc_table()
> +      */
> +     WARN_ON_ONCE(sgl);
> +     return sgt;
> +
> +err_unmap_dma:
> +     if (!i || !state)
> +             ; /* Do nothing */
> +     else if (dma_use_iova(state))
> +             dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> +                              attrs);
> +     else
> +             for_each_sgtable_dma_sg(sgt, sgl, i)
> +                     dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +                                     sg_dma_len(sgl), dir, attrs);

Same here, for braces.

> +     sg_free_table(sgt);
> +err_kfree_sgt:
> +     kfree(sgt);
> +     return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> +                                struct sg_table *sgt,
> +                                enum dma_data_direction dir)
> +{
> +     struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +     struct dma_iova_state *state = attachment->priv;
> +     unsigned long attrs = DMA_ATTR_MMIO;
> +     struct scatterlist *sgl;
> +     int i;
> +
> +     if (!state)
> +             ; /* Do nothing */
> +     else if (dma_use_iova(state))
> +             dma_iova_destroy(attachment->dev, state, priv->size, dir,
> +                              attrs);
> +     else
> +             for_each_sgtable_dma_sg(sgt, sgl, i)
> +                     dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +                                    sg_dma_len(sgl), dir, attrs);
> +

Here too.

> +     sg_free_table(sgt);
> +     kfree(sgt);
> +}
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 75100bf009ba..63214467c875 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf 
> from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the 
> dmabuf.
> + *

Probably worth noting that .flags should be zero; I see we enforce
that.  Thanks,

Alex

> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_region_dma_range {
> +     __u64 offset;
> +     __u64 length;
> +};
> +
> +struct vfio_device_feature_dma_buf {
> +     __u32   region_index;
> +     __u32   open_flags;
> +     __u32   flags;
> +     __u32   nr_ranges;
> +     struct vfio_region_dma_range dma_ranges[];
> +};
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**

Reply via email to