Re: [Qemu-devel] [PATCH 3/3] vfio/pci: Add ioeventfd support

2018-03-15 Thread Alex Williamson
On Wed, 7 Mar 2018 13:56:44 +0800
Peter Xu  wrote:

> On Wed, Feb 28, 2018 at 01:15:20PM -0700, Alex Williamson wrote:
> 
> [...]
> 
> > @@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > vdev->irq_type = VFIO_PCI_NUM_IRQS;
> > mutex_init(&vdev->igate);
> > spin_lock_init(&vdev->irqlock);
> > +   mutex_init(&vdev->ioeventfds_lock);
> 
> Do we also need to destroy the mutex in vfio_pci_remove?
> 
> I see that vfio_pci_device.igate is also missing a destructor.  I'm
> not sure about either.

Yeah, mutex_destroy() is purely for debugging and I must have missed it
when implementing vfio.  I'll add it in the remove function and try to
clean up the others in a separate patch at some point.  Thanks,

Alex
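
A minimal sketch of the follow-up Alex describes, assuming the usual
vfio_del_group_dev()/kfree() shape of vfio_pci_remove() (the remove path
itself is not quoted in this thread):

  static void vfio_pci_remove(struct pci_dev *pdev)
  {
          struct vfio_pci_device *vdev;

          vdev = vfio_del_group_dev(&pdev->dev);
          if (!vdev)
                  return;

          /*
           * mutex_destroy() only does real work under CONFIG_DEBUG_MUTEXES,
           * but pairing it with the mutex_init() in vfio_pci_probe() keeps
           * lock debugging accurate across probe/remove cycles.
           */
          mutex_destroy(&vdev->ioeventfds_lock);

          /* ... existing teardown (regions, vdev itself) continues here ... */
          kfree(vdev);
  }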



Re: [Qemu-devel] [PATCH 3/3] vfio/pci: Add ioeventfd support

2018-03-15 Thread Alex Williamson
On Tue, 13 Mar 2018 14:12:34 +0100
Auger Eric  wrote:
> On 28/02/18 21:15, Alex Williamson wrote:
> > +long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
> > +   uint64_t data, int count, int fd)
> > +{
> > +   struct pci_dev *pdev = vdev->pdev;
> > +   loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
> > +   int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
> > +   struct vfio_pci_ioeventfd *ioeventfd;
> > +   int (*handler)(void *addr, void *value);
> > +
> > +   /* Only support ioeventfds into BARs */
> > +   if (bar > VFIO_PCI_BAR5_REGION_INDEX)
> > +   return -EINVAL;
> > +
> > +   if (pos + count > pci_resource_len(pdev, bar))
> > +   return -EINVAL;
> > +
> > +   /* Disallow ioeventfds working around MSI-X table writes */
> > +   if (bar == vdev->msix_bar &&
> > +   !(pos + count <= vdev->msix_offset ||
> > + pos >= vdev->msix_offset + vdev->msix_size))
> > +   return -EINVAL;
> > +
> > +   switch (count) {
> > +   case 1:
> > +   handler = &vfio_pci_ioeventfd_handler8;
> > +   break;
> > +   case 2:
> > +   handler = &vfio_pci_ioeventfd_handler16;
> > +   break;
> > +   case 4:
> > +   handler = &vfio_pci_ioeventfd_handler32;
> > +   break;
> > +#ifdef iowrite64
> > +   case 8:
> > +   handler = &vfio_pci_ioeventfd_handler64;
> > +   break;  
> From a user's point of view, is it straightforward that this setup will
> be rejected?  This is not documented in the uapi at the moment.

I added a mention in the uapi, do you see any need for more?
Essentially I consider this an entirely optional accelerator; bus
drivers are free to implement as much or as little as they want.
Userspace can clearly make do without it (we've gone this long
without it), and it's easy to reject cases we don't want to
support.  Thanks,

Alex
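
To illustrate the "optional accelerator" point, a hypothetical userspace
probe for the 8-byte case might look like the sketch below.  It relies only
on the ioctl name and the argsz/flags/offset/data/fd fields visible in the
handler quoted in this thread, and assumes a <linux/vfio.h> that carries
this patch; error handling is deliberately reduced to "fall back to
trapping the write".

  #include <errno.h>
  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/vfio.h>         /* assumed to carry this patch */

  /*
   * Try to arm an 8-byte ioeventfd.  Returns 0 on success or a negative
   * errno.  EINVAL (e.g. no iowrite64 in this kernel config) or ENOTTY
   * (older kernel) simply means: keep trapping the write in userspace,
   * which is always correct, just slower.
   */
  static int try_ioeventfd64(int device_fd, uint64_t offset,
                             uint64_t data, int efd)
  {
          struct vfio_device_ioeventfd ie;

          memset(&ie, 0, sizeof(ie));
          ie.argsz  = sizeof(ie);
          ie.flags  = 8;          /* write size in bytes, power of two */
          ie.offset = offset;     /* device-region offset of the register */
          ie.data   = data;       /* value the kernel writes on trigger */
          ie.fd     = efd;        /* eventfd; -1 would de-assign instead */

          return ioctl(device_fd, VFIO_DEVICE_IOEVENTFD, &ie) ? -errno : 0;
  }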



Re: [Qemu-devel] [PATCH 3/3] vfio/pci: Add ioeventfd support

2018-03-13 Thread Auger Eric
Hi Alex,

On 28/02/18 21:15, Alex Williamson wrote:
> The ioeventfd here is actually irqfd handling of an ioeventfd such as
> supported in KVM.  A user is able to pre-program a device write to
> occur when the eventfd triggers.  This is yet another instance of
> eventfd-irqfd triggering between KVM and vfio.  The impetus for this
> is high frequency writes to pages which are virtualized in QEMU.
> Enabling this near-direct write path for selected registers within
> the virtualized page can improve performance and reduce overhead.
> Specifically this is initially targeted at NVIDIA graphics cards where
> the driver issues a write to an MMIO register within a virtualized
> region in order to allow the MSI interrupt to re-trigger.
> 
> Signed-off-by: Alex Williamson 
> ---
>  drivers/vfio/pci/vfio_pci.c |   34 ++
>  drivers/vfio/pci/vfio_pci_private.h |   18 +
>  drivers/vfio/pci/vfio_pci_rdwr.c|  115 +++
>  include/uapi/linux/vfio.h   |   27 
>  4 files changed, 194 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index b0f759476900..ad18ed266dc0 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -305,6 +305,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  {
>   struct pci_dev *pdev = vdev->pdev;
>   struct vfio_pci_dummy_resource *dummy_res, *tmp;
> + struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
>   int i, bar;
>  
>   /* Stop the device from further DMA */
> @@ -314,6 +315,15 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>   VFIO_IRQ_SET_ACTION_TRIGGER,
>   vdev->irq_type, 0, 0, NULL);
>  
> + /* Device closed, don't need mutex here */
> + list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
> +  &vdev->ioeventfds_list, next) {
> + vfio_virqfd_disable(&ioeventfd->virqfd);
> + list_del(&ioeventfd->next);
> + kfree(ioeventfd);
> + }
> + vdev->ioeventfds_nr = 0;
> +
>   vdev->virq_disabled = false;
>  
>   for (i = 0; i < vdev->num_regions; i++)
> @@ -1012,6 +1022,28 @@ static long vfio_pci_ioctl(void *device_data,
>  
>   kfree(groups);
>   return ret;
> + } else if (cmd == VFIO_DEVICE_IOEVENTFD) {
> + struct vfio_device_ioeventfd ioeventfd;
> + int count;
> +
> + minsz = offsetofend(struct vfio_device_ioeventfd, fd);
> +
> + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (ioeventfd.argsz < minsz)
> + return -EINVAL;
> +
> + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
> + return -EINVAL;
> +
> + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
> +
> + if (hweight8(count) != 1 || ioeventfd.fd < -1)
> + return -EINVAL;
> +
> + return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
> +   ioeventfd.data, count, ioeventfd.fd);
>   }
>  
>   return -ENOTTY;
> @@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   vdev->irq_type = VFIO_PCI_NUM_IRQS;
>   mutex_init(&vdev->igate);
>   spin_lock_init(&vdev->irqlock);
> + mutex_init(&vdev->ioeventfds_lock);
> + INIT_LIST_HEAD(&vdev->ioeventfds_list);
>  
>   ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
>   if (ret) {
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index f561ac1c78a0..33a48c3ba11c 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -29,6 +29,18 @@
>  #define PCI_CAP_ID_INVALID   0xFF    /* default raw access */
>  #define PCI_CAP_ID_INVALID_VIRT  0xFE    /* default virt access */
>  
> +/* Cap maximum number of ioeventfds per device (arbitrary) */
> +#define VFIO_PCI_IOEVENTFD_MAX   1000
> +
> +struct vfio_pci_ioeventfd {
> + struct list_head    next;
> + struct virqfd   *virqfd;
> + loff_t  pos;
> + uint64_t            data;
> + int bar;
> + int count;
> +};
> +
>  struct vfio_pci_irq_ctx {
>   struct eventfd_ctx  *trigger;
>   struct virqfd   *unmask;
> @@ -92,9 +104,12 @@ struct vfio_pci_device {
>   bool                nointx;
>   struct pci_saved_state  *pci_saved_state;
>   int refcnt;
> + int ioeventfds_nr;
>   struct eventfd_ctx  *err_trigger;
>   struct eventfd_ctx  *req_trigger;
>   struct list_head    dummy_resources_list;
> + struct mutex        ioeventfds_lock;
> + struct list_head    ioeventfds_list;

Re: [Qemu-devel] [PATCH 3/3] vfio/pci: Add ioeventfd support

2018-03-06 Thread Peter Xu
On Wed, Feb 28, 2018 at 01:15:20PM -0700, Alex Williamson wrote:

[...]

> @@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   vdev->irq_type = VFIO_PCI_NUM_IRQS;
>   mutex_init(&vdev->igate);
>   spin_lock_init(&vdev->irqlock);
> + mutex_init(&vdev->ioeventfds_lock);

Do we also need to destroy the mutex in vfio_pci_remove?

I see that vfio_pci_device.igate is also missing a destructor.  I'm
not sure about either.

Thanks,

> + INIT_LIST_HEAD(&vdev->ioeventfds_list);
>  
>   ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
>   if (ret) {

-- 
Peter Xu



Re: [Qemu-devel] [PATCH 3/3] vfio/pci: Add ioeventfd support

2018-03-05 Thread kbuild test robot
Hi Alex,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v4.16-rc4 next-20180306]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:
https://github.com/0day-ci/linux/commits/Alex-Williamson/vfio-pci-Pull-BAR-mapping-setup-from-read-write-path/20180303-015851
reproduce:
# apt-get install sparse
make ARCH=x86_64 allmodconfig
make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> drivers/vfio/pci/vfio_pci_rdwr.c:290:1: sparse: incorrect type in argument 2 (different address spaces) @@ expected void [noderef] <asn:2> * @@ got void *opaque @@
   drivers/vfio/pci/vfio_pci_rdwr.c:290:1:    expected void [noderef] <asn:2> *
   drivers/vfio/pci/vfio_pci_rdwr.c:290:1:    got void *opaque
   drivers/vfio/pci/vfio_pci_rdwr.c:291:1: sparse: incorrect type in argument 2 (different address spaces) @@ expected void [noderef] <asn:2> * @@ got void *opaque @@
   drivers/vfio/pci/vfio_pci_rdwr.c:291:1:    expected void [noderef] <asn:2> *
   drivers/vfio/pci/vfio_pci_rdwr.c:291:1:    got void *opaque
   drivers/vfio/pci/vfio_pci_rdwr.c:292:1: sparse: incorrect type in argument 2 (different address spaces) @@ expected void [noderef] <asn:2> * @@ got void *opaque @@
   drivers/vfio/pci/vfio_pci_rdwr.c:292:1:    expected void [noderef] <asn:2> *
   drivers/vfio/pci/vfio_pci_rdwr.c:292:1:    got void *opaque
>> drivers/vfio/pci/vfio_pci_rdwr.c:378:52: sparse: incorrect type in argument 1 (different address spaces) @@ expected void *opaque @@ got void [noderef] <asn:2> *

vim +290 drivers/vfio/pci/vfio_pci_rdwr.c

   286
   287  #ifdef iowrite64
   288  VFIO_PCI_IOEVENTFD_HANDLER(64)
   289  #endif
 > 290  VFIO_PCI_IOEVENTFD_HANDLER(32)
   291  VFIO_PCI_IOEVENTFD_HANDLER(16)
   292  VFIO_PCI_IOEVENTFD_HANDLER(8)
   293
   294  long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
   295                          uint64_t data, int count, int fd)
   296  {
   297          struct pci_dev *pdev = vdev->pdev;
   298          loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
   299          int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
   300          struct vfio_pci_ioeventfd *ioeventfd;
   301          int (*handler)(void *addr, void *value);
   302
   303          /* Only support ioeventfds into BARs */
   304          if (bar > VFIO_PCI_BAR5_REGION_INDEX)
   305                  return -EINVAL;
   306
   307          if (pos + count > pci_resource_len(pdev, bar))
   308                  return -EINVAL;
   309
   310          /* Disallow ioeventfds working around MSI-X table writes */
   311          if (bar == vdev->msix_bar &&
   312              !(pos + count <= vdev->msix_offset ||
   313                pos >= vdev->msix_offset + vdev->msix_size))
   314                  return -EINVAL;
   315
   316          switch (count) {
   317          case 1:
   318                  handler = &vfio_pci_ioeventfd_handler8;
   319                  break;
   320          case 2:
   321                  handler = &vfio_pci_ioeventfd_handler16;
   322                  break;
   323          case 4:
   324                  handler = &vfio_pci_ioeventfd_handler32;
   325                  break;
   326  #ifdef iowrite64
   327          case 8:
   328                  handler = &vfio_pci_ioeventfd_handler64;
   329                  break;
   330  #endif
   331          default:
   332                  return -EINVAL;
   333          }
   334
   335          ret = vfio_pci_setup_barmap(vdev, bar);
   336          if (ret)
   337                  return ret;
   338
   339          mutex_lock(&vdev->ioeventfds_lock);
   340
   341          list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) {
   342                  if (ioeventfd->pos == pos && ioeventfd->bar == bar &&
   343                      ioeventfd->data == data && ioeventfd->count == count) {
   344                          if (fd == -1) {
   345                                  vfio_virqfd_disable(&ioeventfd->virqfd);
   346                                  list_del(&ioeventfd->next);
   347                                  vdev->ioeventfds_nr--;
   348                                  kfree(ioeventfd);
   349                                  ret = 0;
   350                          } else
   351                                  ret = -EEXIST;
   352
   353                          goto out_unlock;
   354                  }
   355          }
   356
   357          if (fd < 0) {
   358                  ret = -ENODEV;
   359                  goto out_unlock;
   360          }
   361
   362          if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) {
   363                  ret = -ENOSPC;
   364                  goto out_unlock;
   365          }
   366
   367          ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL);
   368          if (!ioeventfd) {
   369                  ret = -ENOMEM;
   370                  goto out_unlock;
   371          }

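The warnings above boil down to the __iomem (address space 2) annotation
being laundered through plain void * handler parameters.  Purely as an
illustration, and not necessarily how the patch was respun, one way to
keep sparse happy is to carry the annotated BAR address inside the
ioeventfd object instead of the untyped virqfd opaque pointer; the addr
field and the _example names below are assumptions, not part of the
posted patch.

  struct vfio_pci_ioeventfd_example {
          struct virqfd   *virqfd;
          void __iomem    *addr;          /* annotated mapping target */
          uint64_t        data;
          int             count;
  };

  static int vfio_pci_ioeventfd_handler_example(void *opaque, void *unused)
  {
          struct vfio_pci_ioeventfd_example *ioeventfd = opaque;

          /*
           * The __iomem pointer never passes through a plain void *, so
           * sparse can track the address space from setup to iowrite.
           */
          switch (ioeventfd->count) {
          case 1:
                  iowrite8(ioeventfd->data, ioeventfd->addr);
                  break;
          case 2:
                  iowrite16(ioeventfd->data, ioeventfd->addr);
                  break;
          case 4:
                  iowrite32(ioeventfd->data, ioeventfd->addr);
                  break;
  #ifdef iowrite64
          case 8:
                  iowrite64(ioeventfd->data, ioeventfd->addr);
                  break;
  #endif
          }
          return 0;
  }
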
[Qemu-devel] [PATCH 3/3] vfio/pci: Add ioeventfd support

2018-02-28 Thread Alex Williamson
The ioeventfd here is actually irqfd handling of an ioeventfd such as
supported in KVM.  A user is able to pre-program a device write to
occur when the eventfd triggers.  This is yet another instance of
eventfd-irqfd triggering between KVM and vfio.  The impetus for this
is high frequency writes to pages which are virtualized in QEMU.
Enabling this near-direct write path for selected registers within
the virtualized page can improve performance and reduce overhead.
Specifically this is initially targeted at NVIDIA graphics cards where
the driver issues a write to an MMIO register within a virtualized
region in order to allow the MSI interrupt to re-trigger.

Signed-off-by: Alex Williamson 
---
 drivers/vfio/pci/vfio_pci.c |   34 ++
 drivers/vfio/pci/vfio_pci_private.h |   18 +
 drivers/vfio/pci/vfio_pci_rdwr.c|  115 +++
 include/uapi/linux/vfio.h   |   27 
 4 files changed, 194 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index b0f759476900..ad18ed266dc0 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -305,6 +305,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
 {
struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_dummy_resource *dummy_res, *tmp;
+   struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
int i, bar;
 
/* Stop the device from further DMA */
@@ -314,6 +315,15 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
VFIO_IRQ_SET_ACTION_TRIGGER,
vdev->irq_type, 0, 0, NULL);
 
+   /* Device closed, don't need mutex here */
+   list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
+                            &vdev->ioeventfds_list, next) {
+   vfio_virqfd_disable(&ioeventfd->virqfd);
+   list_del(&ioeventfd->next);
+   kfree(ioeventfd);
+   }
+   vdev->ioeventfds_nr = 0;
+
vdev->virq_disabled = false;
 
for (i = 0; i < vdev->num_regions; i++)
@@ -1012,6 +1022,28 @@ static long vfio_pci_ioctl(void *device_data,
 
kfree(groups);
return ret;
+   } else if (cmd == VFIO_DEVICE_IOEVENTFD) {
+   struct vfio_device_ioeventfd ioeventfd;
+   int count;
+
+   minsz = offsetofend(struct vfio_device_ioeventfd, fd);
+
+   if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (ioeventfd.argsz < minsz)
+   return -EINVAL;
+
+   if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
+   return -EINVAL;
+
+   count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
+
+   if (hweight8(count) != 1 || ioeventfd.fd < -1)
+   return -EINVAL;
+
+   return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
+ ioeventfd.data, count, ioeventfd.fd);
}
 
return -ENOTTY;
@@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
vdev->irq_type = VFIO_PCI_NUM_IRQS;
mutex_init(&vdev->igate);
spin_lock_init(&vdev->irqlock);
+   mutex_init(&vdev->ioeventfds_lock);
+   INIT_LIST_HEAD(&vdev->ioeventfds_list);
 
ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
if (ret) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index f561ac1c78a0..33a48c3ba11c 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -29,6 +29,18 @@
 #define PCI_CAP_ID_INVALID 0xFF    /* default raw access */
 #define PCI_CAP_ID_INVALID_VIRT    0xFE    /* default virt access */
 
+/* Cap maximum number of ioeventfds per device (arbitrary) */
+#define VFIO_PCI_IOEVENTFD_MAX 1000
+
+struct vfio_pci_ioeventfd {
+   struct list_head    next;
+   struct virqfd   *virqfd;
+   loff_t  pos;
+   uint64_t            data;
+   int bar;
+   int count;
+};
+
 struct vfio_pci_irq_ctx {
struct eventfd_ctx  *trigger;
struct virqfd   *unmask;
@@ -92,9 +104,12 @@ struct vfio_pci_device {
bool                nointx;
struct pci_saved_state  *pci_saved_state;
int refcnt;
+   int ioeventfds_nr;
struct eventfd_ctx  *err_trigger;
struct eventfd_ctx  *req_trigger;
struct list_head    dummy_resources_list;
+   struct mutex        ioeventfds_lock;
+   struct list_head    ioeventfds_list;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
@@ -120,6 +135,9 @@ extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char
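
To make the flow in the commit message concrete (a guest MMIO write is
turned into an eventfd signal by KVM, and the eventfd in turn triggers the
pre-programmed device write added here), a hypothetical userspace wiring
could look like the sketch below.  The KVM side uses the standard
KVM_IOEVENTFD interface rather than anything added by this series, and the
addresses, offsets, and 4-byte size are made-up example values.

  #include <stdint.h>
  #include <sys/eventfd.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>
  #include <linux/vfio.h>         /* assumed to carry this patch */

  /* Chain: guest write -> KVM ioeventfd -> eventfd -> VFIO device write. */
  static int arm_mmio_fastpath(int vm_fd, int vfio_device_fd,
                               uint64_t guest_mmio_addr,
                               uint64_t region_offset, uint32_t value)
  {
          int efd = eventfd(0, EFD_CLOEXEC);
          if (efd < 0)
                  return -1;

          /* 1) The guest's 4-byte write no longer exits to userspace;
           *    KVM just signals the eventfd (wildcard data match). */
          struct kvm_ioeventfd kick = {
                  .addr  = guest_mmio_addr,
                  .len   = 4,
                  .fd    = efd,
                  .flags = 0,
          };
          if (ioctl(vm_fd, KVM_IOEVENTFD, &kick))
                  return -1;

          /* 2) When the eventfd fires, the vfio-pci bus driver replays the
           *    pre-programmed 4-byte write at region_offset on our behalf. */
          struct vfio_device_ioeventfd replay = {
                  .argsz  = sizeof(replay),
                  .flags  = 4,            /* write size in bytes */
                  .offset = region_offset,
                  .data   = value,
                  .fd     = efd,
          };
          return ioctl(vfio_device_fd, VFIO_DEVICE_IOEVENTFD, &replay);
  }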