On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
> On 1/30/26 14:01, Leon Romanovsky wrote:
> > On Fri, Jan 30, 2026 at 09:30:59AM +0100, Christian König wrote:
> >> On 1/24/26 20:14, Leon Romanovsky wrote:
> >>> From: Leon Romanovsky <[email protected]>
> >>>
> >>> dma-buf invalidation is handled asynchronously by the hardware, so VFIO
> >>> must wait until all affected objects have been fully invalidated.
> >>>
> >>> In addition, the dma-buf exporter is expecting that all importers unmap 
> >>> any
> >>> buffers they previously mapped.
> >>>
> >>> Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO 
> >>> regions")
> >>> Signed-off-by: Leon Romanovsky <[email protected]>
> >>> ---
> >>>  drivers/vfio/pci/vfio_pci_dmabuf.c | 71 
> >>> ++++++++++++++++++++++++++++++++++++--
> >>>  1 file changed, 68 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c 
> >>> b/drivers/vfio/pci/vfio_pci_dmabuf.c
> >>> index d8ceafabef48..485515629fe4 100644
> >>> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> >>> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> >>> @@ -17,6 +17,8 @@ struct vfio_pci_dma_buf {
> >>>   struct dma_buf_phys_vec *phys_vec;
> >>>   struct p2pdma_provider *provider;
> >>>   u32 nr_ranges;
> >>> + struct kref kref;
> >>> + struct completion comp;
> >>>   u8 revoked : 1;
> >>>  };
> >>>  
> >>> @@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf 
> >>> *dmabuf,
> >>>   return 0;
> >>>  }
> >>>  
> >>> +static void vfio_pci_dma_buf_done(struct kref *kref)
> >>> +{
> >>> + struct vfio_pci_dma_buf *priv =
> >>> +         container_of(kref, struct vfio_pci_dma_buf, kref);
> >>> +
> >>> + complete(&priv->comp);
> >>> +}
> >>> +
> >>>  static struct sg_table *
> >>>  vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> >>>                enum dma_data_direction dir)
> >>>  {
> >>>   struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> >>> + struct sg_table *ret;
> >>>  
> >>>   dma_resv_assert_held(priv->dmabuf->resv);
> >>>  
> >>>   if (priv->revoked)
> >>>           return ERR_PTR(-ENODEV);
> >>>  
> >>> - return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> >>> -                                priv->phys_vec, priv->nr_ranges,
> >>> -                                priv->size, dir);
> >>> + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> >>> +                               priv->phys_vec, priv->nr_ranges,
> >>> +                               priv->size, dir);
> >>> + if (IS_ERR(ret))
> >>> +         return ret;
> >>> +
> >>> + kref_get(&priv->kref);
> >>> + return ret;
> >>>  }
> >>>  
> >>>  static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> >>>                              struct sg_table *sgt,
> >>>                              enum dma_data_direction dir)
> >>>  {
> >>> + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> >>> +
> >>> + dma_resv_assert_held(priv->dmabuf->resv);
> >>> +
> >>>   dma_buf_free_sgt(attachment, sgt, dir);
> >>> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> >>>  }
> >>>  
> >>>  static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> >>> @@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct 
> >>> vfio_pci_core_device *vdev, u32 flags,
> >>>           goto err_dev_put;
> >>>   }
> >>>  
> >>> + kref_init(&priv->kref);
> >>> + init_completion(&priv->comp);
> >>> +
> >>>   /* dma_buf_put() now frees priv */
> >>>   INIT_LIST_HEAD(&priv->dmabufs_elm);
> >>>   down_write(&vdev->memory_lock);
> >>> @@ -326,6 +350,8 @@ void vfio_pci_dma_buf_move(struct 
> >>> vfio_pci_core_device *vdev, bool revoked)
> >>>   lockdep_assert_held_write(&vdev->memory_lock);
> >>>  
> >>>   list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> >>> +         unsigned long wait;
> >>> +
> >>>           if (!get_file_active(&priv->dmabuf->file))
> >>>                   continue;
> >>>  
> >>> @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct 
> >>> vfio_pci_core_device *vdev, bool revoked)
> >>>                   dma_resv_lock(priv->dmabuf->resv, NULL);
> >>>                   priv->revoked = revoked;
> >>>                   dma_buf_invalidate_mappings(priv->dmabuf);
> >>> +                 dma_resv_wait_timeout(priv->dmabuf->resv,
> >>> +                                       DMA_RESV_USAGE_BOOKKEEP, false,
> >>> +                                       MAX_SCHEDULE_TIMEOUT);
> >>>                   dma_resv_unlock(priv->dmabuf->resv);
> >>> +                 if (revoked) {
> >>> +                         kref_put(&priv->kref, vfio_pci_dma_buf_done);
> >>> +                         /* Let's wait till all DMA unmap are completed. 
> >>> */
> >>> +                         wait = wait_for_completion_timeout(
> >>> +                                 &priv->comp, secs_to_jiffies(1));
> >>> +                         /*
> >>> +                          * If you see this WARN_ON, it means that
> >>> +                          * importer didn't call unmap in response to
> >>> +                          * dma_buf_invalidate_mappings() which is not
> >>> +                          * allowed.
> >>> +                          */
> >>> +                         WARN(!wait,
> >>> +                              "Timed out waiting for DMABUF unmap, 
> >>> importer has a broken invalidate_mapping()");
> >>
> >> You can do the revoke to do your resource management, for example re-use 
> >> the backing store for something else.
> >>
> >> But it is mandatory that you keep the mapping around indefinitely until 
> >> the importer closes it.
> >>
> >> Before that you can't do things like runtime PM or remove or anything 
> >> which would make the DMA addresses invalid.
> >>
> >> As far as I can see vfio_pci_dma_buf_move() is used exactly for that use 
> >> case so this here is an absolutely clear NAK from my side for this 
> >> approach.
> >>
> >> You can either split up the functionality of vfio_pci_dma_buf_move() into 
> >> vfio_pci_dma_buf_invalidate_mappings() and vfio_pci_dma_buf_flush() and 
> >> then call the later whenever necessary or you keep it in one function and 
> >> block everybody until the importer has dropped all mappings.
> > 
> > No problem, I can change it to be:
> > 
> > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c 
> > b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > index d087d018d547..53772a84c93b 100644
> > --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > @@ -357,23 +357,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device 
> > *vdev, bool revoked)
> >                         dma_resv_unlock(priv->dmabuf->resv);
> >                         if (revoked) {
> >                                 kref_put(&priv->kref, 
> > vfio_pci_dma_buf_done);
> > -                               /*
> > -                                * Let's wait for 1 second till all DMA 
> > unmap
> > -                                * are completed. It is supposed to catch 
> > dma-buf
> > -                                * importers which lied about their support
> > -                                * of dmabuf revoke. See 
> > dma_buf_invalidate_mappings()
> > -                                * for the expected behaviour.
> > -                                */
> > -                               wait = wait_for_completion_timeout(
> > -                                       &priv->comp, secs_to_jiffies(1));
> > -                               /*
> > -                                * If you see this WARN_ON, it means that
> > -                                * importer didn't call unmap in response to
> > -                                * dma_buf_invalidate_mappings() which is 
> > not
> > -                                * allowed.
> > -                                */
> > -                               WARN(!wait,
> > -                                    "Timed out waiting for DMABUF unmap, 
> > importer has a broken invalidate_mapping()");
> > +                               wait_for_completion(&priv->comp);
> >                         } else {
> >                                 /*
> >                                  * Kref is initialize again, because when 
> > revoke
> > 
> > Do you want me to send v6?
> 
> That would work for me.
> 
> Question is if you really want to do it this way? See usually exporters try 
> to avoid blocking such functions.
> 
> What exporters usually do instead is to grab references, e.g. call 
> pm_runtime_get_sync() when either a DMA-buf, a DMA-buf attachment or in your 
> case here a mapping of this attachment is made.

I view this as an enhancement that can be addressed later down the road.

> 
> But all of this is just a suggestion, if you are fine with blocking then feel 
> free to add my rb.

It is fine for initial version. We need to start somewhere.

Thanks

> 
> Regards,
> Christian.
> 
> > 
> > Thanks
> > 
> >>
> >>> +                 } else {
> >>> +                         /*
> >>> +                          * Kref is initialize again, because when revoke
> >>> +                          * was performed the reference counter was 
> >>> decreased
> >>> +                          * to zero to trigger completion.
> >>> +                          */
> >>> +                         kref_init(&priv->kref);
> >>> +                         /*
> >>> +                          * There is no need to wait as no mapping was
> >>> +                          * performed when the previous status was
> >>> +                          * priv->revoked == true.
> >>> +                          */
> >>> +                         reinit_completion(&priv->comp);
> >>> +                 }
> >>>           }
> >>>           fput(priv->dmabuf->file);
> >>
> >> This is also extremely questionable. Why doesn't the dmabuf have a 
> >> reference while on the linked list?
> >>
> >> Regards,
> >> Christian.
> >>
> >>>   }
> >>> @@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct 
> >>> vfio_pci_core_device *vdev)
> >>>  
> >>>   down_write(&vdev->memory_lock);
> >>>   list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> >>> +         unsigned long wait;
> >>> +
> >>>           if (!get_file_active(&priv->dmabuf->file))
> >>>                   continue;
> >>>  
> >>> @@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct 
> >>> vfio_pci_core_device *vdev)
> >>>           priv->vdev = NULL;
> >>>           priv->revoked = true;
> >>>           dma_buf_invalidate_mappings(priv->dmabuf);
> >>> +         dma_resv_wait_timeout(priv->dmabuf->resv,
> >>> +                               DMA_RESV_USAGE_BOOKKEEP, false,
> >>> +                               MAX_SCHEDULE_TIMEOUT);
> >>>           dma_resv_unlock(priv->dmabuf->resv);
> >>> +         kref_put(&priv->kref, vfio_pci_dma_buf_done);
> >>> +         wait = wait_for_completion_timeout(&priv->comp,
> >>> +                                            secs_to_jiffies(1));
> >>> +         WARN_ON(!wait);
> >>>           vfio_device_put_registration(&vdev->vdev);
> >>>           fput(priv->dmabuf->file);
> >>>   }
> >>>
> >>
> >>
> 
> 

Reply via email to