On Tue, Apr 30, 2024 at 04:24:50PM -0600, Alex Williamson wrote:
> > +static vm_fault_t vfio_pci_dma_buf_fault(struct vm_fault *vmf)
> > +{
> > +   struct vm_area_struct *vma = vmf->vma;
> > +   struct vfio_pci_dma_buf *priv = vma->vm_private_data;
> > +   pgoff_t pgoff = vmf->pgoff;
> > +
> > +   if (pgoff >= priv->nr_pages)
> > +           return VM_FAULT_SIGBUS;
> > +
> > +   return vmf_insert_pfn(vma, vmf->address,
> > +                         page_to_pfn(priv->pages[pgoff]));
> > +}
> 
> How does this prevent the MMIO space from being mmap'd when disabled at
> the device?  How is the mmap revoked when the MMIO becomes disabled?
> Is it part of the move protocol?

Yes, we should not have a mmap handler for dmabuf. vfio memory must be
mmapped in the normal way.

> > +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> > +{
> > +   struct vfio_pci_dma_buf *priv = dmabuf->priv;
> > +
> > +   /*
> > +    * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> > +    * The refcount prevents both.
> > +    */
> > +   if (priv->vdev) {
> > +           release_p2p_pages(priv, priv->nr_pages);
> > +           kfree(priv->pages);
> > +           down_write(&priv->vdev->memory_lock);
> > +           list_del_init(&priv->dmabufs_elm);
> > +           up_write(&priv->vdev->memory_lock);
> 
> Why are we acquiring and releasing the memory_lock write lock
> throughout when we're not modifying the device memory enable state?
> Ugh, we're using it to implicitly lock dmabufs_elm/dmabufs aren't we...

Not really implicitly, but yes the dmabufs list is locked by the
memory_lock.

> > +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 
> > flags,
> > +                             struct vfio_device_feature_dma_buf __user 
> > *arg,
> > +                             size_t argsz)
> > +{
> > +   struct vfio_device_feature_dma_buf get_dma_buf;
> > +   struct vfio_region_p2p_area *p2p_areas;
> > +   DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> > +   struct vfio_pci_dma_buf *priv;
> > +   int i, ret;
> > +
> > +   ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> > +                            sizeof(get_dma_buf));
> > +   if (ret != 1)
> > +           return ret;
> > +
> > +   if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
> > +           return -EFAULT;
> > +
> > +   p2p_areas = memdup_array_user(&arg->p2p_areas,
> > +                                 get_dma_buf.nr_areas,
> > +                                 sizeof(*p2p_areas));
> > +   if (IS_ERR(p2p_areas))
> > +           return PTR_ERR(p2p_areas);
> > +
> > +   priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> > +   if (!priv)
> > +           return -ENOMEM;
> 
> p2p_areas is leaked.

What is this new p2p_areas thing? It wasn't in my patch..

> > +   exp_info.ops = &vfio_pci_dmabuf_ops;
> > +   exp_info.size = priv->nr_pages << PAGE_SHIFT;
> > +   exp_info.flags = get_dma_buf.open_flags;
> 
> open_flags from userspace are unchecked.

Huh. That seems to be a dmabuf pattern. :\

> > +   exp_info.priv = priv;
> > +
> > +   priv->dmabuf = dma_buf_export(&exp_info);
> > +   if (IS_ERR(priv->dmabuf)) {
> > +           ret = PTR_ERR(priv->dmabuf);
> > +           goto err_free_pages;
> > +   }
> > +
> > +   /* dma_buf_put() now frees priv */
> > +   INIT_LIST_HEAD(&priv->dmabufs_elm);
> > +   down_write(&vdev->memory_lock);
> > +   dma_resv_lock(priv->dmabuf->resv, NULL);
> > +   priv->revoked = !__vfio_pci_memory_enabled(vdev);
> > +   vfio_device_try_get_registration(&vdev->vdev);
> 
> I guess we're assuming this can't fail in the ioctl path of an open
> device?

Seems like a bug added here.. My version had this as
vfio_device_get(). This stuff has probably changed since I wrote it.

> > +   list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
> > +   dma_resv_unlock(priv->dmabuf->resv);
> 
> What was the purpose of locking this?

?

> > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > +{
> > +   struct vfio_pci_dma_buf *priv;
> > +   struct vfio_pci_dma_buf *tmp;
> > +
> > +   lockdep_assert_held_write(&vdev->memory_lock);
> > +
> > +   list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> > +           if (!get_file_rcu(&priv->dmabuf->file))
> > +                   continue;
> 
> Does this indicate the file was closed?

Yes.. The original patch was clearer, Christian asked to open
code it:

+ * Returns true if a reference was successfully obtained. The caller must
+ * interlock with the dmabuf's release function in some way, such as RCU, to
+ * ensure that this is not called on freed memory.

A description of how the locking is working should be put in a comment
above that code.

> > @@ -623,6 +625,8 @@ static int vfio_basic_config_write(struct 
> > vfio_pci_core_device *vdev, int pos,
> >             *virt_cmd &= cpu_to_le16(~mask);
> >             *virt_cmd |= cpu_to_le16(new_cmd & mask);
> >  
> > +           if (__vfio_pci_memory_enabled(vdev))
> > +                   vfio_pci_dma_buf_move(vdev, false);
> >             up_write(&vdev->memory_lock);
> >     }
> 
> FLR is also accessible through config space.

That needs fixing up

> > @@ -1246,7 +1248,10 @@ static int vfio_pci_ioctl_reset(struct 
> > vfio_pci_core_device *vdev,
> >      */
> >     vfio_pci_set_power_state(vdev, PCI_D0);
> >  
> > +   vfio_pci_dma_buf_move(vdev, true);
> >     ret = pci_try_reset_function(vdev->pdev);
> > +   if (__vfio_pci_memory_enabled(vdev))
> > +           vfio_pci_dma_buf_move(vdev, false);
> >     up_write(&vdev->memory_lock);
> >  
> 
> What about runtime power management?

Yes

Yes, I somehow think it was added

> > -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 
> > flags,
> > -                                  uuid_t __user *arg, size_t argsz)
> > +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev,
> > +                                  u32 flags, uuid_t __user *arg,
> > +                                  size_t argsz)
> >  {
> > -   struct vfio_pci_core_device *vdev =
> > -           container_of(device, struct vfio_pci_core_device, vdev);
> 
> Why is only this existing function updated?  If the core device deref
> is common then apply it to all and do so in a separate patch.  Thanks,

Hm, I think that was some rebasing issue.

> > +/**
> > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> > + * region selected.
> > + *
> > + * open_flags are the typical flags passed to open(2), eg O_RDWR, 
> > O_CLOEXEC,
> > + * etc. offset/length specify a slice of the region to create the dmabuf 
> > from.
> > + * If both are 0 then the whole region is used.
> > + *
> > + * Return: The fd number on success, -1 and errno is set on failure.
> > + */
> > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> > +
> > +struct vfio_region_p2p_area {
> > +   __u32   region_index;
> > +   __u32   __pad;
> > +   __u64   offset;
> > +   __u64   length;
> > +};
> > +
> > +struct vfio_device_feature_dma_buf {
> > +   __u32   open_flags;
> > +   __u32   nr_areas;
> > +   struct vfio_region_p2p_area p2p_areas[];
> > +};

Still have no clue what this p2p areas is. You want to create a dmabuf
out of a scatterlist? Why??

I'm also not sure of the use of the pci_p2pdma family of functions, it
is a bold step to make struct pages, that isn't going to work in quite
a lot of cases. It is really hacky/wrong to immediately call
pci_alloc_p2pmem() to defeat the internal genalloc.

I'd rather we stick with the original design. Leon is working on DMA
API changes that should address half the issue.

Jason

Reply via email to