Re: Unable to create more than 1 guest virtio-net device using vhost-net backend
On Sun, Mar 21, 2010 at 01:58:29PM +0200, Avi Kivity wrote: On 03/21/2010 01:34 PM, Michael S. Tsirkin wrote: On Sun, Mar 21, 2010 at 12:29:31PM +0200, Avi Kivity wrote: On 03/21/2010 12:15 PM, Michael S. Tsirkin wrote: Nothing easy that I can see. Each device needs 2 of these. Avi, Gleb, any objections to increasing the limit to say 16? That would give us 5 more devices over the current limit of 6 per guest. Increase it to 200, then. OK. I think we'll also need a smarter allocator than the bus->dev_count++ we have now. Right? No, why? We'll run into problems if devices are created/removed in random order, won't we? unregister_dev() takes care of it. Eventually we'll want faster scanning than the linear search we employ now, though. Yes, I suspect with 200 entries we will :). Let's just make it 16 for now? Let's make it 200 and fix the performance problems later. Making it 16 is just asking for trouble. I did this, and performance with vhost seems to become much more noisy and drop by about 10% on average, even though in practice only a single device is created. Still trying to figure it out ... Any ideas? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote: This patch adds a driver for my shared memory PCI device using the uio_pci interface. The driver has three memory regions. The first memory region is for device registers for sending interrupts. The second BAR is for receiving MSI-X interrupts and the third memory region maps the shared memory. The device only exports the first and third memory regions to userspace. This driver supports MSI-X and regular pin interrupts. Currently, the number of MSI vectors is set to 4, which could be increased, but the driver will work with fewer vectors. If MSI is not available, then regular interrupts will be used.
---
 drivers/uio/Kconfig       |    8 ++
 drivers/uio/Makefile      |    1 +
 drivers/uio/uio_ivshmem.c |  235 +
 3 files changed, 244 insertions(+), 0 deletions(-)
 create mode 100644 drivers/uio/uio_ivshmem.c

diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index 1da73ec..b92cded 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -74,6 +74,14 @@ config UIO_SERCOS3
	  If you compile this as a module, it will be called uio_sercos3.

+config UIO_IVSHMEM
+	tristate "KVM shared memory PCI driver"
+	default n
+	help
+	  Userspace I/O interface for the KVM shared memory device. This
+	  driver will make available two memory regions, the first is
+	  registers and the second is a region for sharing between VMs.
+
 config UIO_PCI_GENERIC
	tristate "Generic driver for PCI 2.3 and PCI Express cards"
	depends on PCI

diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index 18fd818..25c1ca5 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC)	+= uio_aec.o
 obj-$(CONFIG_UIO_SERCOS3)	+= uio_sercos3.o
 obj-$(CONFIG_UIO_PCI_GENERIC)	+= uio_pci_generic.o
 obj-$(CONFIG_UIO_NETX)	+= uio_netx.o
+obj-$(CONFIG_UIO_IVSHMEM)	+= uio_ivshmem.o

diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
new file mode 100644
index 000..607435b
--- /dev/null
+++ b/drivers/uio/uio_ivshmem.c
@@ -0,0 +1,235 @@
+/*
+ * UIO IVShmem Driver
+ *
+ * (C) 2009 Cam Macdonell
+ * based on Hilscher CIF card driver (C) 2007 Hans J. Koch h...@linutronix.de
+ *
+ * Licensed under GPL version 2 only.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/uio_driver.h>
+
+#include <asm/io.h>
+
+#define IntrStatus 0x04
+#define IntrMask 0x00
+
+struct ivshmem_info {
+	struct uio_info *uio;
+	struct pci_dev *dev;
+	char (*msix_names)[256];
+	struct msix_entry *msix_entries;
+	int nvectors;
+};
+
+static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
+{
+
+	void __iomem *plx_intscr = dev_info->mem[0].internal_addr
+					+ IntrStatus;
+	u32 val;
+
+	val = readl(plx_intscr);
+	if (val == 0)
+		return IRQ_NONE;
+
+	printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
+{
+
+	struct uio_info *dev_info = (struct uio_info *) opaque;
+
+	/* we have to do this explicitly when using MSI-X */
+	uio_event_notify(dev_info);

How does userspace know which vector was triggered?
+	printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
+	return IRQ_HANDLED;
+

extra empty line

+}
+
+static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
+{
+	int i, err;
+	const char *name = "ivshmem";
+
+	printk(KERN_INFO "devname is %s\n", name);

These KERN_INFO messages need to be cleaned up, they would look confusing in the log.

+	ivs_info->nvectors = nvectors;
+
+
+	ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
+					 GFP_KERNEL);
+	ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
+				       GFP_KERNEL);
+

need to handle errors

+	for (i = 0; i < nvectors; ++i)
+		ivs_info->msix_entries[i].entry = i;
+
+	err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
+			      ivs_info->nvectors);
+	if (err > 0) {
+		ivs_info->nvectors = err; /* msi-x positive error code
+					     returns the number available */
+		err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
+				      ivs_info->nvectors);
+		if (err > 0) {
+			printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
+			return -ENOSPC;
+		}
+	}

We can also get err < 0 here.

+
+	printk(KERN_INFO "err is %d\n", err);
+	if (err) return err;

coding style rule violation

+
+	for (i = 0; i < ivs_info->nvectors; i++) {
+
+		snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
+			 "%s-config", name);
+
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote: This patch adds a driver for my shared memory PCI device using the uio_pci interface. The driver has three memory regions. The first memory region is for device registers for sending interrupts. The second BAR is for receiving MSI-X interrupts and the third memory region maps the shared memory. The device only exports the first and third memory regions to userspace. This driver supports MSI-X and regular pin interrupts. Currently, the number of MSI vectors is set to 4, which could be increased, but the driver will work with fewer vectors. If MSI is not available, then regular interrupts will be used.

Some high level questions, sorry if they have been raised in the past:
- Can this device use the virtio framework? This gives us some standards to work off, with feature negotiation, the ability to detect config changes, support for non-PCI platforms, decent documentation that is easy to extend, and a legal id range to use. You would thus have your driver in uio/uio_virtio_shmem.c.
- Why are you using 32 bit long memory accesses for interrupts? 16 bit IO would be enough and it's faster. This is what virtio-pci does.
- How was the driver tested?
Re: [PATCH v3 0/2] Inter-VM shared memory PCI device
On Thu, Mar 25, 2010 at 11:04:54AM +0200, Avi Kivity wrote: Again, I recommend Rusty's virtio-pci for inspiration. Not just inspiration, how about building on virtio-pci? -- MST
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 11:40:09AM +0200, Avi Kivity wrote: On 03/25/2010 11:15 AM, Michael S. Tsirkin wrote: - Why are you using 32 bit long memory accesses for interrupts? 16 bit IO would be enough and it's faster. This is what virtio-pci does. Why is 16 bit io faster? Something to do with the need for emulation to get address/data for pci memory accesses?
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 11:58:30AM +0200, Avi Kivity wrote: On 03/25/2010 11:44 AM, Michael S. Tsirkin wrote: On Thu, Mar 25, 2010 at 11:40:09AM +0200, Avi Kivity wrote: On 03/25/2010 11:15 AM, Michael S. Tsirkin wrote: - Why are you using 32 bit long memory accesses for interrupts? 16 bit IO would be enough and it's faster. This is what virtio-pci does. Why is 16 bit io faster? Something to do with the need for emulation to get address/data for pci memory accesses? pio is definitely faster than mmio (all that is needed is to set one bit on the BAR). But 32 vs. 16 makes no difference. Right. That's what I meant.
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 11:23:05AM -0500, Anthony Liguori wrote: On 03/25/2010 11:18 AM, Cam Macdonell wrote: On Thu, Mar 25, 2010 at 3:15 AM, Michael S. Tsirkinm...@redhat.com wrote: On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote: This patch adds a driver for my shared memory PCI device using the uio_pci interface. The driver has three memory regions. The first memory region is for device registers for sending interrupts. The second BAR is for receiving MSI-X interrupts and the third memory region maps the shared memory. The device only exports the first and third memory regions to userspace. This driver supports MSI-X and regular pin interrupts. Currently, the number of MSI vectors is set to 4 which could be increased, but the driver will work with fewer vectors. If MSI is not available, then regular interrupts will be used. Some high level questions, sorry if they have been raised in the past: - Can this device use virtio framework? This gives us some standards to work off, with feature negotiation, ability to detect config changes, support for non-PCI platforms, decent documentation that is easy to extend, legal id range to use. You would thus have your driver in uio/uio_virtio_shmem.c There has been previous discussion of virtio, however while virtio is good for exporting guest memory, it's not ideal for importing memory into a guest. virtio is a DMA-based API which means that it doesn't assume cache coherent shared memory. The PCI transport takes advantage of cache coherent shared memory but it's not strictly required. Memory sharing in virtio would be a layering violation because it forces cache coherent shared memory for all virtio transports. Regards, Anthony Liguori I am not sure I understand the argument. We can still reuse feature negotiation, notifications etc from virtio. If some transports can not support cache coherent shared memory, the device won't work there. But it won't work there anyway, even without using virtio. 
We are not forcing all devices to assume cache coherency. In other words, let's not fall into the midlayer mistake: let's treat virtio as a library. -- MST
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 10:30:42AM -0600, Cam Macdonell wrote: On Thu, Mar 25, 2010 at 3:05 AM, Michael S. Tsirkin m...@redhat.com wrote:

+static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
+{
+
+	struct uio_info *dev_info = (struct uio_info *) opaque;
+
+	/* we have to do this explicitly when using MSI-X */
+	uio_event_notify(dev_info);

How does userspace know which vector was triggered?

Right now, it doesn't. If a user had a particular need they would need to write their own uio driver. I guess this leads to a discussion of MSI support in UIO and how it would work with userspace.

So why request more than one vector then?
+	printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
+	return IRQ_HANDLED;
+

extra empty line

+}
+
+static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
+{
+	int i, err;
+	const char *name = "ivshmem";
+
+	printk(KERN_INFO "devname is %s\n", name);

These KERN_INFO messages need to be cleaned up, they would look confusing in the log.

Agreed. I will clean most of these out.

+	ivs_info->nvectors = nvectors;
+
+
+	ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
+					 GFP_KERNEL);
+	ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
+				       GFP_KERNEL);
+

need to handle errors

+	for (i = 0; i < nvectors; ++i)
+		ivs_info->msix_entries[i].entry = i;
+
+	err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
+			      ivs_info->nvectors);
+	if (err > 0) {
+		ivs_info->nvectors = err; /* msi-x positive error code
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 06:36:02PM +0200, Avi Kivity wrote: On 03/25/2010 11:05 AM, Michael S. Tsirkin wrote:

+static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
+	{
+		.vendor = 0x1af4,
+		.device = 0x1110,

vendor ids must be registered with the PCI SIG. This one does not seem to be registered. That's the Qumranet vendor ID. Isn't virtio_pci matching that id already?
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 06:32:15PM +0200, Avi Kivity wrote: On 03/25/2010 06:23 PM, Anthony Liguori wrote: There has been previous discussion of virtio, however while virtio is good for exporting guest memory, it's not ideal for importing memory into a guest. virtio is a DMA-based API which means that it doesn't assume cache coherent shared memory. The PCI transport takes advantage of cache coherent shared memory but it's not strictly required. Aren't we violating this by not using dma_alloc_coherent() for the queues? I don't see what changing this would buy us though, unless a non-cache-coherent architecture implements kvm. TCG runs everything on a single processor so there are no cache coherency issues.
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Mar 25, 2010 at 10:24:20AM -0600, Cam Macdonell wrote: On Thu, Mar 25, 2010 at 3:46 AM, Avi Kivity a...@redhat.com wrote: On 03/25/2010 08:09 AM, Cam Macdonell wrote: This patch adds a driver for my shared memory PCI device using the uio_pci interface. The driver has three memory regions. The first memory region is for device registers for sending interrupts. The second BAR is for receiving MSI-X interrupts and the third memory region maps the shared memory. The device only exports the first and third memory regions to userspace. This driver supports MSI-X and regular pin interrupts. Currently, the number of MSI vectors is set to 4 which could be increased, but the driver will work with fewer vectors. If MSI is not available, then regular interrupts will be used. There is now a generic PCI 2.3 driver that can handle all PCI devices. It doesn't support MSI, but if we add MSI support then it can be used without the need for a specialized driver. Agreed, I'd be happy to use the generic driver if MSI is there. What would MSI support for UIO look like? An array of struct uio_irq for the different vectors? Cam My idea was to supply a way to bind an eventfd to a vector.
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Sat, Mar 27, 2010 at 08:48:34PM +0300, Avi Kivity wrote: On 03/26/2010 07:14 PM, Cam Macdonell wrote: I'm not familiar with the uio internals, but for the interface, an ioctl() on the fd to assign an eventfd to an MSI vector. Similar to ioeventfd, but instead of mapping a doorbell to an eventfd, it maps a real MSI to an eventfd. uio will never support ioctls. Why not? Maybe irqcontrol could be extended? What's irqcontrol? uio accepts 32 bit writes to the char device file. We can encode the fd number there, and use the high bit to signal assign/deassign. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic.
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Sun, Mar 28, 2010 at 11:02:11AM +0300, Avi Kivity wrote: On 03/28/2010 10:47 AM, Michael S. Tsirkin wrote: Maybe irqcontrol could be extended? What's irqcontrol? uio accepts 32 bit writes to the char device file. We can encode the fd number there, and use the high bit to signal assign/deassign. Ugh. Very unexpandable. It currently fails on any non-4 byte write. So if we need more bits in the future we can always teach it about e.g. 8 byte writes. Do you think it's worth doing it now already, and using 8 byte writes for msi mapping?
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Sun, Mar 28, 2010 at 12:45:02PM +0300, Avi Kivity wrote: On 03/28/2010 12:40 PM, Michael S. Tsirkin wrote: uio accepts 32 bit writes to the char device file. We can encode the fd number there, and use the high bit to signal assign/deassign. Ugh. Very unexpandable. It currently fails on any non-4 byte write. So if we need more bits in the future we can always teach it about e.g. 8 byte writes. Do you think it's worth doing it now already, and using 8 byte writes for msi mapping? Aren't ioctls a lot simpler? Multiplexing multiple functions on write()s is just ioctls done uglier. I don't have an opinion here. Writes do have an advantage that strace can show the buffer content without being patched. Further, something along the lines proposed above means that we do not need to depend on a system header to get the functionality.
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Mon, Mar 29, 2010 at 11:59:24PM +0300, Avi Kivity wrote: On 03/28/2010 10:48 PM, Cam Macdonell wrote: On Sat, Mar 27, 2010 at 11:48 AM, Avi Kivity a...@redhat.com wrote: On 03/26/2010 07:14 PM, Cam Macdonell wrote: I'm not familiar with the uio internals, but for the interface, an ioctl() on the fd to assign an eventfd to an MSI vector. Similar to ioeventfd, but instead of mapping a doorbell to an eventfd, it maps a real MSI to an eventfd. uio will never support ioctls. Why not? Perhaps I spoke too strongly, but it was rejected before: http://thread.gmane.org/gmane.linux.kernel/756481 With a compelling case perhaps it could be added. Ah, the usual "ioctls are ugly, go away". It could be done via sysfs:

$ cat /sys/.../msix/max-interrupts
256

This one can be discovered with existing sysfs.

$ echo 4 > /sys/.../msix/allocate
$ # subdirectories 0 1 2 3 magically appear
$ # bind fd 13 to msix

There's no way to know, when qemu starts, how many vectors will be used by the driver. So I think we can just go ahead and allocate as many vectors as the device supports at the moment when the first eventfd is bound.

$ echo 13 > /sys/.../msix/2/bind-fd

I think that this one can't work: both fd 13 and the sysfs file will get closed when the /bin/echo process exits.

$ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13

Call me old fashioned, but I prefer ioctls. I think we will have to use ioctl or irqcontrol because of the lifetime issues outlined above.
Re: [PATCH] kvm: Increase NR_IOBUS_DEVS limit to 200
On Tue, Mar 30, 2010 at 04:48:25PM -0700, Sridhar Samudrala wrote: This patch increases the current hardcoded limit of NR_IOBUS_DEVS from 6 to 200. We are hitting this limit when creating a guest with more than 1 virtio-net device using the vhost-net backend. Each virtio-net device requires 2 such devices to service notifications from rx/tx queues. Signed-off-by: Sridhar Samudrala s...@us.ibm.com

I tried this, but observed a measurable performance degradation with vhost net and this patch. Did not investigate yet. Do you see this as well?

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a3fd0f9..7fb48d3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -54,7 +54,7 @@ extern struct kmem_cache *kvm_vcpu_cache;
  */
 struct kvm_io_bus {
	int dev_count;
-#define NR_IOBUS_DEVS 6
+#define NR_IOBUS_DEVS 200
	struct kvm_io_device *devs[NR_IOBUS_DEVS];
 };
Re: [PATCH][QEMU][VHOST]fix feature bit handling for mergeable rx buffers
On Tue, Mar 30, 2010 at 12:21:05PM -0700, David L Stevens wrote: This patch adds mergeable receive buffer support to qemu-kvm, to allow enabling it when vhost_net supports it. It also adds a missing call to vhost_net_ack_features() to push acked features to vhost_net. The patch is relative to Michael Tsirkin's qemu-kvm git tree. +-DLS Signed-off-by: David L Stevens dlstev...@us.ibm.com

Patch is well-formed, thanks! BTW, where qemu-kvm and qemu share code, it is better to send the patch against qemu. Since qemu-kvm generally carries the kernel headers it needs, it might also be a good idea to add if_tun to the kvm header directory.

diff -ruNp qemu-kvm.mst/hw/vhost_net.c qemu-kvm.dls/hw/vhost_net.c
--- qemu-kvm.mst/hw/vhost_net.c	2010-03-03 13:39:07.0 -0800
+++ qemu-kvm.dls/hw/vhost_net.c	2010-03-29 20:37:34.0 -0700
@@ -5,6 +5,7 @@
 #include <sys/ioctl.h>
 #include <linux/vhost.h>
 #include <linux/virtio_ring.h>
+#include <linux/if_tun.h>
 #include <netpacket/packet.h>
 #include <net/ethernet.h>
 #include <net/if.h>
@@ -38,15 +39,6 @@ unsigned vhost_net_get_features(struct v
     return features;
 }

-void vhost_net_ack_features(struct vhost_net *net, unsigned features)
-{
-    net->dev.acked_features = net->dev.backend_features;
-    if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))
-        net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY);
-    if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC))
-        net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
-}
-
 static int vhost_net_get_fd(VLANClientState *backend)
 {
     switch (backend->info->type) {
@@ -58,6 +50,25 @@ static int vhost_net_get_fd(VLANClientSt
     }
 }

+void vhost_net_ack_features(struct vhost_net *net, unsigned features)
+{
+    int vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+
+    net->dev.acked_features = net->dev.backend_features;
+    if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))
+        net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+    if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC))
+        net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
+    if (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
+        net->dev.acked_features |= (1 << VIRTIO_NET_F_MRG_RXBUF);
+        vnet_hdr_sz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+    }
+#ifdef TUNSETVNETHDRSZ
+    if (ioctl(vhost_net_get_fd(net->vc), TUNSETVNETHDRSZ, &vnet_hdr_sz) < 0)
+        perror("TUNSETVNETHDRSZ");
+#endif /* TUNSETVNETHDRSZ */

Note that this will break injecting raw packets after migration (that code assumes the header size is sizeof(struct virtio_net_hdr)). Need to fix it. So I think we are better off adding tap_set_vnet_header_size. That function would:
- Remember the current value, return success and do nothing if the value is not changed.
- Try to change, and return an error if it can't.
- To get doubly sure, we can call TUNSETVNETHDRSZ with sizeof(struct virtio_net_hdr) at startup. If we get success once, we should keep getting it further on as well; otherwise we can exit.
An additional benefit is that the tap driver will stay localized in one file.

+}
+
 struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
 {
     int r;

diff -ruNp qemu-kvm.mst/hw/virtio-net.c qemu-kvm.dls/hw/virtio-net.c
--- qemu-kvm.mst/hw/virtio-net.c	2010-03-03 13:39:07.0 -0800
+++ qemu-kvm.dls/hw/virtio-net.c	2010-03-29 16:15:46.0 -0700
@@ -211,12 +211,16 @@ static void virtio_net_set_features(Virt
     n->mergeable_rx_bufs = !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF));

     if (n->has_vnet_hdr) {
+        struct vhost_net *vhost_net = tap_get_vhost_net(n->nic->nc.peer);
+
         tap_set_offload(n->nic->nc.peer,
                         (features >> VIRTIO_NET_F_GUEST_CSUM) & 1,
                         (features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
                         (features >> VIRTIO_NET_F_GUEST_TSO6) & 1,
                         (features >> VIRTIO_NET_F_GUEST_ECN) & 1,
                         (features >> VIRTIO_NET_F_GUEST_UFO) & 1);
+        if (vhost_net)
+            vhost_net_ack_features(vhost_net, features);

Good catch, thanks! I'll apply this separately.

     }
 }
Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
On Tue, Mar 30, 2010 at 07:23:48PM -0600, David Stevens wrote: This patch adds support for the Mergeable Receive Buffers feature to vhost_net. Changes: 1) generalize descriptor allocation functions to allow multiple descriptors per packet 2) add socket peek to know datalen at buffer allocation time 3) change notification to enable a multi-buffer max packet, rather than the single-buffer run until completely empty Changes from previous revision: 1) incorporate review comments from Michael Tsirkin 2) assume use of TUNSETVNETHDRSZ ioctl by qemu, which simplifies vnet header processing 3) fixed notification code to only affect the receive side Signed-Off-By: David L Stevens dlstev...@us.ibm.com [in-line for review, attached for applying w/o whitespace mangling] attached patch seems to be whiespace damaged as well. Does the origin pass checkpatch.pl for you? diff -ruNp net-next-p0/drivers/vhost/net.c net-next-p3/drivers/vhost/net.c --- net-next-p0/drivers/vhost/net.c 2010-03-22 12:04:38.0 -0700 +++ net-next-p3/drivers/vhost/net.c 2010-03-30 12:50:57.0 -0700 @@ -54,26 +54,6 @@ struct vhost_net { enum vhost_net_poll_state tx_poll_state; }; -/* Pop first len bytes from iovec. Return number of segments used. 
*/ -static int move_iovec_hdr(struct iovec *from, struct iovec *to, - size_t len, int iov_count) -{ - int seg = 0; - size_t size; - while (len && seg < iov_count) { - size = min(from->iov_len, len); - to->iov_base = from->iov_base; - to->iov_len = size; - from->iov_len -= size; - from->iov_base += size; - len -= size; - ++from; - ++to; - ++seg; - } - return seg; -} - /* Caller must have TX VQ lock */ static void tx_poll_stop(struct vhost_net *net) { @@ -97,7 +77,8 @@ static void tx_poll_start(struct vhost_n static void handle_tx(struct vhost_net *net) { struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; - unsigned head, out, in, s; + unsigned out, in; + struct iovec head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, @@ -108,8 +89,8 @@ static void handle_tx(struct vhost_net * }; size_t len, total_len = 0; int err, wmem; - size_t hdr_size; struct socket *sock = rcu_dereference(vq->private_data); + if (!sock) return; @@ -127,22 +108,19 @@ static void handle_tx(struct vhost_net * if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); - hdr_size = vq->hdr_size; for (;;) { - head = vhost_get_vq_desc(&net->dev, vq, vq->iov, - ARRAY_SIZE(vq->iov), - &out, &in, - NULL, NULL); + head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq, + vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); I find this casting confusing. Is it really expensive to add an array of heads so that we do not need to cast? /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (head == vq->num) { + if (head.iov_base == (void *)vq->num) { wmem = atomic_read(&sock->sk->sk_wmem_alloc); if (wmem >= sock->sk->sk_sndbuf * 3 / 4) { tx_poll_start(net, sock); set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; } - if (unlikely(vhost_enable_notify(vq))) { + if (unlikely(vhost_enable_notify(vq, 0))) { vhost_disable_notify(vq); continue; } @@ -154,27 +132,30 @@ static void handle_tx(struct vhost_net * break; } /* Skip header. TODO: support TSO.
*/ - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); msg.msg_iovlen = out; - len = iov_length(vq->iov, out); + head.iov_len = len = iov_length(vq->iov, out); + /* Sanity check */ if (!len) { - vq_err(vq, "Unexpected header len for TX: " - "%zd expected %zd\n", - iov_length(vq->hdr, s), hdr_size); + vq_err(vq, "Unexpected buffer len for TX: %zd ", len); break; } - /* TODO: Check specific error and bomb out unless ENOBUFS? */ err =
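For reference, the move_iovec_hdr() helper that the patch deletes can be reproduced in userspace to show what it did: pop the first len bytes off one iovec array into another, returning how many destination segments were used. This is a sketch mirroring the kernel logic, not the kernel code itself (min_sz replaces the kernel's min()).

```c
#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* Pop first len bytes from 'from' into 'to'.  Return number of 'to'
 * segments used.  'from' is advanced past the popped bytes. */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
                          size_t len, int iov_count)
{
    int seg = 0;
    size_t size;

    while (len && seg < iov_count) {
        size = min_sz(from->iov_len, len);
        to->iov_base = from->iov_base;
        to->iov_len = size;
        from->iov_len -= size;
        from->iov_base = (char *)from->iov_base + size;
        len -= size;
        ++from;
        ++to;
        ++seg;
    }
    return seg;
}
```

Popping 10 bytes from segments of 4 and 8 bytes consumes the first segment entirely and 6 bytes of the second, leaving 2 bytes behind; this per-packet header shuffling (and the copy it implies) is what the TUNSETVNETHDRSZ approach removes.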
Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
On Thu, Apr 01, 2010 at 05:27:18PM +0800, Xin Xiaohui wrote: Add a device to utilize the vhost-net backend driver for copy-less data transfer between guest FE and host NIC. It pins the guest user space to the host memory and provides proto_ops as sendmsg/recvmsg to vhost-net. Signed-off-by: Xin Xiaohui xiaohui@intel.com Signed-off-by: Zhao Yu yzha...@gmail.com Signed-off-by: Jeff Dike jd...@c2.user-mode-linux.org --- Michael, Sorry, I did not resolve all your comments this time. I did not move the device out of the vhost directory because I did not implement real asynchronous read/write operations for the mp device for now. We hope we can do this after the network code is checked in. Well, placement of code is not such a major issue. It's just that code under drivers/net gets more and better review than drivers/vhost. I'll try to get Dave's opinion. For the DoS issue, I'm not sure how much of a limit on what get_user_pages() can pin is reasonable; should we compute the bandwidth to decide it? There's a ulimit for locked memory. Can we use this, decreasing the value in the rlimit array? We can do this when the backend is enabled and re-increment it when the backend is disabled. We use get_user_pages_fast() and use set_page_dirty_lock(). Remove read_rcu_lock()/unlock(), since the ctor pointer is only changed by BIND/UNBIND ioctl, and during that time, the NIC is always stopped and all outstanding requests are done, so the ctor pointer cannot be raced into a wrong state. Qemu needs a userspace write, is that a synchronous one or asynchronous one? It's a synchronous non-blocking write.
Thanks Xiaohui drivers/vhost/Kconfig |5 + drivers/vhost/Makefile|2 + drivers/vhost/mpassthru.c | 1162 + include/linux/mpassthru.h | 29 ++ 4 files changed, 1198 insertions(+), 0 deletions(-) create mode 100644 drivers/vhost/mpassthru.c create mode 100644 include/linux/mpassthru.h diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 9f409f4..ee32a3b 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -9,3 +9,8 @@ config VHOST_NET To compile this driver as a module, choose M here: the module will be called vhost_net. +config VHOST_PASSTHRU + tristate Zerocopy network driver (EXPERIMENTAL) + depends on VHOST_NET + ---help--- + zerocopy network I/O support diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 72dd020..3f79c79 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,2 +1,4 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o + +obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c new file mode 100644 index 000..6e8fc4d --- /dev/null +++ b/drivers/vhost/mpassthru.c @@ -0,0 +1,1162 @@ +/* + * MPASSTHRU - Mediate passthrough device. + * Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#define DRV_NAMEmpassthru +#define DRV_DESCRIPTION Mediate passthru device driver +#define DRV_COPYRIGHT (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G + +#include linux/module.h +#include linux/errno.h +#include linux/kernel.h +#include linux/major.h +#include linux/slab.h +#include linux/smp_lock.h +#include linux/poll.h +#include linux/fcntl.h +#include linux/init.h +#include linux/aio.h + +#include linux/skbuff.h +#include linux/netdevice.h +#include linux/etherdevice.h +#include linux/miscdevice.h +#include linux/ethtool.h +#include linux/rtnetlink.h +#include linux/if.h +#include linux/if_arp.h +#include linux/if_ether.h +#include linux/crc32.h +#include linux/nsproxy.h +#include linux/uaccess.h +#include linux/virtio_net.h +#include linux/mpassthru.h +#include net/net_namespace.h +#include net/netns/generic.h +#include net/rtnetlink.h +#include net/sock.h + +#include asm/system.h + +#include vhost.h + +/* Uncomment to enable debugging */ +/* #define MPASSTHRU_DEBUG 1 */ + +#ifdef MPASSTHRU_DEBUG +static int debug; + +#define DBG if (mp-debug) printk +#define DBG1 if (debug == 2) printk +#else +#define DBG(a...) +#define DBG1(a...) +#endif + +#define COPY_THRESHOLD (L1_CACHE_BYTES * 4) +#define COPY_HDR_LEN (L1_CACHE_BYTES 64 ? 64 : L1_CACHE_BYTES) + +struct frag { + u16 offset; + u16 size; +}; + +struct page_ctor { +
Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
On Wed, Mar 31, 2010 at 03:04:43PM -0700, David Stevens wrote: Michael S. Tsirkin m...@redhat.com wrote on 03/31/2010 05:02:28 AM: The attached patch seems to be whitespace damaged as well. Does the original pass checkpatch.pl for you? Yes, but I probably broke it in the transfer -- will be more careful with the next revision. + head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq, + vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); I find this casting confusing. Is it really expensive to add an array of heads so that we do not need to cast? It needs the heads and the lengths, which looks a lot like an iovec. I was trying to resist adding a new struct XXX { unsigned head; unsigned len; } just for this, but I could make these parallel arrays, one with head index and the other with length. + if (vq->rxmaxheadcount < headcount) + vq->rxmaxheadcount = headcount; This seems to be the only place where we set the rxmaxheadcount value. Maybe it can be moved out of vhost.c to net.c? If the vhost library needs this, it can get it as a function parameter. I can move it to vhost_get_heads(), sure. + /* Skip header. TODO: support TSO. */ You seem to have removed the code that skips the header. Won't this break non-GSO backends such as raw? From our prior discussion, what I had in mind was that we'd remove all special-header processing by using the ioctl you added for TUN and, later, an equivalent ioctl for raw (when supported in qemu-kvm). Qemu would arrange headers with tap (and later raw) to get what the guest expects, and vhost then just passes all data as-is between the socket and the ring. That not only removes the different-header-len code for mergeable buffers, but also eliminates making a copy of the header for every packet as before. Yes, but please note that in practice, if this is what we do, then the vhost header size is 0 and there is no actual copy. Vhost has no need to do anything with the header, or even know its length.
It also doesn't have to care what the type of the backend is -- raw or tap. I think that simplifies everything, but it does mean that raw socket support requires a header ioctl for the different vnethdr sizes if the guest wants a vnethdr with and without mergeable buffers. Actually, I thought this was your idea and the point of the TUNSETVNETHDRSZ ioctl, but I guess you had something else in mind? Yes. However, we also have an ioctl that sets the vhost header size, and it allows supporting simple backends which do not support an (equivalent of) TUNSETVNETHDRSZ. We have two of these now: packet and macvtap. So I thought that unless there are gains in code simplicity and/or performance, we can keep supporting these just as well. - /* TODO: Check specific error and bomb out unless EAGAIN? */ Do you think it's not a good idea? EAGAIN is not possible after the change, because we don't even enter the loop unless we have an skb on the read queue; the other cases bomb out, so I figured the comment for future work is now done. :-) Guest could be buggy so we'll get EFAULT. If the skb is taken off the rx queue (as below), we might get EAGAIN. if (err < 0) { - vhost_discard_vq_desc(vq); + vhost_discard_vq_desc(vq, headcount); break; } I think we should detect and discard truncated messages, since len might not be reliable if userspace pulls a packet from under us. Also, if the new packet is shorter than the old one, there's no truncation but headcount is wrong. So the simplest fix IMO would be to compare err with the expected len. If there's a difference, we hit the race and so we would discard the packet. Sure. /* TODO: Should check and handle checksum.
*/ + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) { + struct virtio_net_hdr_mrg_rxbuf *vhdr = + (struct virtio_net_hdr_mrg_rxbuf *) + vq->iov[0].iov_base; + /* add num_bufs */ + if (put_user(headcount, &vhdr->num_buffers)) { + vq_err(vq, "Failed to write num_buffers"); + vhost_discard_vq_desc(vq, headcount); Let's do memcpy_toiovecend etc. so that we do not assume layout. This is also why we need move_iovec: sendmsg might modify the iovec. It would also be nice not to corrupt memory and get a reasonable error if the buffer size that we get is smaller than the expected header size. I wanted to avoid the header copy here, with the reasonable restriction that the guest gives you a buffer at least big
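The "do not assume layout" point can be illustrated in userspace: rather than casting iov[0].iov_base to the header struct and writing num_buffers through it, copy the field to its byte offset across however many segments the guest happened to use. This is a userspace analogue of the kernel's memcpy_toiovecend; the name iov_put_bytes is invented for this sketch.

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

/* Copy len bytes from src into the iovec at the given byte offset,
 * spanning segment boundaries as needed.  Returns 0 on success, -1 if
 * the iovec is too small to hold the field (no layout assumptions). */
static int iov_put_bytes(struct iovec *iov, int iov_count,
                         size_t offset, const void *src, size_t len)
{
    const char *p = src;
    for (int i = 0; i < iov_count && len; i++) {
        if (offset >= iov[i].iov_len) {
            offset -= iov[i].iov_len;   /* field starts in a later segment */
            continue;
        }
        size_t n = iov[i].iov_len - offset;
        if (n > len)
            n = len;
        memcpy((char *)iov[i].iov_base + offset, p, n);
        p += n;
        len -= n;
        offset = 0;
    }
    return len ? -1 : 0;
}
```

Writing a 2-byte num_buffers field at an offset that crosses a segment boundary works transparently, and a too-small buffer is reported instead of corrupting memory.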
Re: [PATCH 0/1] uio_pci_generic: extensions to allow access for non-privileged processes
On Wed, Mar 31, 2010 at 05:08:38PM -0700, Tom Lyon wrote: uio_pci_generic has previously been discussed on the KVM list, but this patch has nothing to do with KVM, so it is also going to LKML. The point of this patch is to beef up the uio_pci_generic driver so that a non-privileged user process can run a user level driver for most PCIe devices. This can only be safe if there is an IOMMU in the system with per-device domains. Why? A per-guest domain should be safe enough. Privileged users (CAP_SYS_RAWIO) are allowed if there is no IOMMU. qemu does not support it; I doubt this last option is worth having. Specifically, I seek to allow low-latency user level network drivers (non tcp/ip) which directly access SR-IOV style virtual network adapters, for use with packages such as OpenMPI. Key areas of change: - ioctl extensions to allow registration and dma mapping of memory regions, with lock accounting - support for mmu notifier driven de-mapping - support for MSI and MSI-X interrupts (the intel 82599 VFs support only MSI-X) - allowing interrupt enabling and device register mapping all through /dev/uio* so that permissions may be granted just by chmod on /dev/uio* For non-privileged users, we need a way to enforce that the device is bound to an IOMMU. Further, locking really needs to be scoped with IOMMU domain existence and with IOMMU mappings: as long as a page is mapped in the IOMMU, it must be locked. This patch does not seem to enforce that. Also note that what we really want is a single IOMMU domain per guest, not per device. For this reason, I think we should address the problem somewhat differently: - Create a character device to represent the IOMMU - This device will handle memory locking etc. - Allow binding this device to the IOMMU - Allow other operations only after the IOMMU is bound Thanks!
-- MST
Re: [PATCH v3 1/1] Shared memory uio_pci driver
On Thu, Apr 01, 2010 at 11:58:29AM +0300, Avi Kivity wrote: On 03/31/2010 12:12 PM, Michael S. Tsirkin wrote: $ echo 4 > /sys/.../msix/allocate $ # subdirectories 0 1 2 3 magically appear $ # bind fd 13 to msix There's no way to know, when qemu starts, how many vectors will be used by the driver. So I think we can just go ahead and allocate as many vectors as supported by the device at the moment when the first eventfd is bound. That will cause a huge amount of vectors to be allocated. It's better to do this dynamically in response to guest programming. The guest unmasks vectors one by one. Linux does not have an API to allocate vectors one by one now. $ echo 13 > /sys/.../msix/2/bind-fd I think that this one can't work; both fd 13 and the sysfs file will get closed when the /bin/echo process exits. I meant figuratively. It's pointless to bind an eventfd from a shell. You'd use fprintf() from qemu to bind the eventfd. $ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13 Call me old fashioned, but I prefer ioctls. I think we will have to use ioctl or irqcontrol because of the lifetime issues outlined above. The lifetime issues don't exist if you do all that from qemu. That's not to say I prefer sysfs to ioctl. What's irqcontrol? The uio core accepts 32-bit writes and passes the value written as an int to an irqcontrol callback in the device. -- error compiling committee.c: too many arguments to function
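The fd semantics the "bind an eventfd to an MSI-X vector" design relies on can be shown from userspace. Each 8-byte write increments the eventfd counter (the kernel-side eventfd_signal() does the same), and a read returns and clears the accumulated count. The wrapper names here are invented for the example; Linux-only.

```c
#include <assert.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* Stand-in for the kernel firing the vector: bump the eventfd counter. */
static int signal_vector(int efd)
{
    uint64_t one = 1;
    return write(efd, &one, sizeof(one)) == sizeof(one) ? 0 : -1;
}

/* Consumer side (e.g. qemu polling the bound fd): read returns the number
 * of signals accumulated since the last read and resets the counter. */
static uint64_t drain_vector(int efd)
{
    uint64_t val = 0;
    if (read(efd, &val, sizeof(val)) != sizeof(val))
        return 0;
    return val;
}
```

Two signals before a read coalesce into a single readable count of 2, which is why eventfd works well for interrupt-style notification: wakeups coalesce instead of queueing.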
Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
On Thu, Apr 01, 2010 at 05:14:56PM +0800, Xin Xiaohui wrote: The vhost-net backend now only supports synchronous send/recv operations. The patch provides multiple submits and asynchronous notifications. This is needed for the zero-copy case. Signed-off-by: Xin Xiaohui xiaohui@intel.com --- Michael, Now, I made vhost alloc/destroy the kiocb, and transfer it via sendmsg/recvmsg. I did not remove vq->receiver, since what the callback does is related to the structures owned by the mp device, and I think keeping them isolated from vhost is a good thing for us all. And it will not prevent the mp device from being independent of vhost in future. Later, when the mp device can be a real device which provides asynchronous read/write operations, rather than just exporting proto_ops, it will use another callback function which is not related to vhost at all. Thanks, I'll look at the code! For the write logging, do you have a function at hand with which we can recompute the log? If so, I think I can use it to recompute the log info when logging is suddenly enabled. For the outstanding requests, do you mean all the user buffers that were submitted before the logging ioctl changed? That may be a lot, and some of them are still in NIC ring descriptors. Waiting for them to finish may need some time. I think it is also reasonable that when the logging ioctl changes, logging changes just after that. The key point is that after the logging ioctl returns, any subsequent change to memory must be logged. It does not matter when the request was submitted; otherwise we will get memory corruption on migration.
Thanks Xiaohui drivers/vhost/net.c | 189 +++-- drivers/vhost/vhost.h | 10 +++ 2 files changed, 192 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 22d5fef..2aafd90 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -17,11 +17,13 @@ #include linux/workqueue.h #include linux/rcupdate.h #include linux/file.h +#include linux/aio.h #include linux/net.h #include linux/if_packet.h #include linux/if_arp.h #include linux/if_tun.h +#include linux/mpassthru.h #include net/sock.h @@ -47,6 +49,7 @@ struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; + struct kmem_cache *cache; /* Tells us whether we are polling a socket for TX. * We only do this when socket buffer fills up. * Protected by tx vq lock. */ @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock) net-tx_poll_state = VHOST_NET_POLL_STARTED; } +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + unsigned long flags; + + spin_lock_irqsave(vq-notify_lock, flags); + if (!list_empty(vq-notifier)) { + iocb = list_first_entry(vq-notifier, + struct kiocb, ki_list); + list_del(iocb-ki_list); + } + spin_unlock_irqrestore(vq-notify_lock, flags); + return iocb; +} + +static void handle_async_rx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + struct vhost_log *vq_log = NULL; + int rx_total_len = 0; + int log, size; + + if (vq-link_state != VHOST_VQ_LINK_ASYNC) + return; + + if (vq-receiver) + vq-receiver(vq); + + vq_log = unlikely(vhost_has_feature( + net-dev, VHOST_F_LOG_ALL)) ? 
vq-log : NULL; + while ((iocb = notify_dequeue(vq)) != NULL) { + vhost_add_used_and_signal(net-dev, vq, + iocb-ki_pos, iocb-ki_nbytes); + log = (int)iocb-ki_user_data; + size = iocb-ki_nbytes; + rx_total_len += iocb-ki_nbytes; + + if (iocb-ki_dtor) + iocb-ki_dtor(iocb); + kmem_cache_free(net-cache, iocb); + + if (unlikely(vq_log)) + vhost_log_write(vq, vq_log, log, size); + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) { + vhost_poll_queue(vq-poll); + break; + } + } +} + +static void handle_async_tx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + int tx_total_len = 0; + + if (vq-link_state != VHOST_VQ_LINK_ASYNC) + return; + + while ((iocb = notify_dequeue(vq)) != NULL) { + vhost_add_used_and_signal(net-dev, vq, + iocb-ki_pos, 0); + tx_total_len += iocb-ki_nbytes; + + if (iocb-ki_dtor) + iocb-ki_dtor(iocb); + +
Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
On Thu, Apr 01, 2010 at 11:22:37AM -0700, David Stevens wrote: kvm-ow...@vger.kernel.org wrote on 04/01/2010 03:54:15 AM: On Wed, Mar 31, 2010 at 03:04:43PM -0700, David Stevens wrote: + head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq, + vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); I find this casting confusing. Is it really expensive to add an array of heads so that we do not need to cast? It needs the heads and the lengths, which looks a lot like an iovec. I was trying to resist adding a new struct XXX { unsigned head; unsigned len; } just for this, but I could make these parallel arrays, one with head index and the other with length. Michael, on this one, if I add vq->heads as an argument to vhost_get_heads (aka vhost_get_desc_n), I'd need the length too. Would you rather this 1) remain an iovec (and a single arg added) but with the cast still there, 2) 2 arrays (head and length) and 2 args added, or 3) a new struct type of {unsigned,int} to carry the heads+len instead of an iovec? My preference would be 1). I agree the casts are ugly, but it is essentially an iovec the way we use it; it's just that the base isn't a pointer but a descriptor index instead. I prefer 2 or 3. If you prefer 1 strongly, I think we should add a detailed comment near the iovec, and a couple of inline wrappers to store/get data in the iovec. EAGAIN is not possible after the change, because we don't even enter the loop unless we have an skb on the read queue; the other cases bomb out, so I figured the comment for future work is now done. :-) Guest could be buggy so we'll get EFAULT. If the skb is taken off the rx queue (as below), we might get EAGAIN. We break on any error. If we get EAGAIN because someone read on the socket, this code would break the loop, but EAGAIN is a more serious problem if it changed since we peeked (because it means someone else is reading the socket).
But I don't understand -- are you suggesting that the error handling be different than that, or that the comment is still relevant? My intention here is to do the TODO from the comment so that it can be removed, by handling all error cases. I think because of the peek, EAGAIN isn't something to be ignored anymore, but the effect is the same whether we break out of the loop or not, since we retry the packet next time around. Essentially, we ignore every error since we will redo it with the same packet the next time around. Maybe we should print something here, but since we'll be retrying the packet that's still on the socket, a permanent error would spew continuously. Maybe we should shut down entirely if we get any negative return value here (including EAGAIN, since that tells us someone messed with the socket when we don't want them to). If you want the comment still there, ok, but I do think EAGAIN isn't a special case per the comment anymore, and is handled as all other errors are: by exiting the loop and retrying next time. +-DLS Yes, I just think some comment should stay, as you say, because otherwise we simply retry continuously. Maybe we should trigger vq_err. This needs to be given some thought, which I have not given it yet. Thinking aloud, EAGAIN means someone is reading the socket together with us; I prefer that this condition be made a fatal error, and we should make sure we are polling the socket so we see packets if more appear. -- MST
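The EAGAIN condition and the peek-then-read pattern under discussion can be demonstrated with a plain non-blocking socket: recv() fails with EAGAIN when the queue is empty, and MSG_PEEK reports a datagram's length without consuming it, which is exactly the window in which a concurrent reader causes the race. The try_recv wrapper is invented for this sketch.

```c
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* recv() wrapper: returns byte count, or -errno on failure, so EAGAIN
 * shows up as a distinguishable return value. */
static int try_recv(int fd, char *buf, size_t len, int flags)
{
    ssize_t n = recv(fd, buf, len, flags);
    return n < 0 ? -errno : (int)n;
}
```

A peek leaves the datagram queued; only the plain recv consumes it. If another thread read the socket between the peek and the recv, the recv would return -EAGAIN even though the peek just succeeded, which is the "someone else is reading the socket" case the thread treats as serious.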
Re: [PATCH] vhost: Make it more scalable by creating a vhost thread per device.
On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: Make vhost scalable by creating a separate vhost thread per vhost device. This provides better scaling across multiple guests and with multiple interfaces in a guest. Thanks for looking into this. An alternative approach is to simply replace create_singlethread_workqueue with create_workqueue which would get us a thread per host CPU. It seems that in theory this should be the optimal approach wrt CPU locality, however, in practice a single thread seems to get better numbers. I have a TODO to investigate this. Could you try looking into this? I am seeing better aggregated througput/latency when running netperf across multiple guests or multiple interfaces in a guest in parallel with this patch. Any numbers? What happens to CPU utilization? Signed-off-by: Sridhar Samudrala s...@us.ibm.com diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index a6a88df..29aa80f 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -339,8 +339,10 @@ static int vhost_net_open(struct inode *inode, struct file *f) return r; } - vhost_poll_init(n-poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); - vhost_poll_init(n-poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); + vhost_poll_init(n-poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, + n-dev); + vhost_poll_init(n-poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, + n-dev); n-tx_poll_state = VHOST_NET_POLL_DISABLED; f-private_data = n; @@ -643,25 +645,14 @@ static struct miscdevice vhost_net_misc = { int vhost_net_init(void) { - int r = vhost_init(); - if (r) - goto err_init; - r = misc_register(vhost_net_misc); - if (r) - goto err_reg; - return 0; -err_reg: - vhost_cleanup(); -err_init: - return r; - + return misc_register(vhost_net_misc); } + module_init(vhost_net_init); void vhost_net_exit(void) { misc_deregister(vhost_net_misc); - vhost_cleanup(); } module_exit(vhost_net_exit); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 7bd7a1e..243f4d3 100644 
--- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -36,8 +36,6 @@ enum { VHOST_MEMORY_F_LOG = 0x1, }; -static struct workqueue_struct *vhost_workqueue; - static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { @@ -56,18 +54,19 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, if (!((unsigned long)key poll-mask)) return 0; - queue_work(vhost_workqueue, poll-work); + queue_work(poll-dev-wq, poll-work); return 0; } /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, work_func_t func, - unsigned long mask) + unsigned long mask, struct vhost_dev *dev) { INIT_WORK(poll-work, func); init_waitqueue_func_entry(poll-wait, vhost_poll_wakeup); init_poll_funcptr(poll-table, vhost_poll_func); poll-mask = mask; + poll-dev = dev; } /* Start polling a file. We add ourselves to file's wait queue. The caller must @@ -96,7 +95,7 @@ void vhost_poll_flush(struct vhost_poll *poll) void vhost_poll_queue(struct vhost_poll *poll) { - queue_work(vhost_workqueue, poll-work); + queue_work(poll-dev-wq, poll-work); } static void vhost_vq_reset(struct vhost_dev *dev, @@ -128,6 +127,11 @@ long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue *vqs, int nvqs) { int i; + + dev-wq = create_singlethread_workqueue(vhost); + if (!dev-wq) + return -ENOMEM; + dev-vqs = vqs; dev-nvqs = nvqs; mutex_init(dev-mutex); @@ -143,7 +147,7 @@ long vhost_dev_init(struct vhost_dev *dev, if (dev-vqs[i].handle_kick) vhost_poll_init(dev-vqs[i].poll, dev-vqs[i].handle_kick, - POLLIN); + POLLIN, dev); } return 0; } @@ -216,6 +220,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev) if (dev-mm) mmput(dev-mm); dev-mm = NULL; + + destroy_workqueue(dev-wq); } static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) @@ -1095,16 +1101,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq) vq_err(vq, Failed to enable notification at %p: %d\n, vq-used-flags, r); } - -int vhost_init(void) -{ - vhost_workqueue = 
create_singlethread_workqueue(vhost); - if (!vhost_workqueue) - return -ENOMEM; - return 0; -} - -void vhost_cleanup(void) -{ - destroy_workqueue(vhost_workqueue); -} diff --git
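The structural change in the patch -- one worker per device instead of one global vhost workqueue -- can be modeled in userspace with pthreads. This is a toy illustration with invented names, not the kernel workqueue API: each "device" owns a thread draining its own queue, so devices no longer serialize behind a shared worker.

```c
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

/* Per-device worker state, loosely modeling dev->wq in the patch. */
struct vhost_dev_model {
    pthread_t thread;
    pthread_mutex_t lock;
    pthread_cond_t cond;
    int pending;   /* queued work items */
    int done;      /* completed work items */
    bool stop;
};

static void *worker(void *arg)
{
    struct vhost_dev_model *d = arg;
    pthread_mutex_lock(&d->lock);
    for (;;) {
        while (!d->pending && !d->stop)
            pthread_cond_wait(&d->cond, &d->lock);
        if (d->pending) {
            d->pending--;
            d->done++;   /* handle_tx()/handle_rx() would run here */
            continue;
        }
        break;           /* stop requested and queue drained */
    }
    pthread_mutex_unlock(&d->lock);
    return NULL;
}

static void dev_init(struct vhost_dev_model *d)   /* like vhost_dev_init */
{
    pthread_mutex_init(&d->lock, NULL);
    pthread_cond_init(&d->cond, NULL);
    d->pending = d->done = 0;
    d->stop = false;
    pthread_create(&d->thread, NULL, worker, d);
}

static void dev_queue(struct vhost_dev_model *d)  /* like vhost_poll_queue */
{
    pthread_mutex_lock(&d->lock);
    d->pending++;
    pthread_cond_signal(&d->cond);
    pthread_mutex_unlock(&d->lock);
}

static int dev_cleanup(struct vhost_dev_model *d) /* like destroy_workqueue */
{
    pthread_mutex_lock(&d->lock);
    d->stop = true;
    pthread_cond_signal(&d->cond);
    pthread_mutex_unlock(&d->lock);
    pthread_join(d->thread, NULL);
    return d->done;
}
```

Two independent devices each drain their own queue to completion; whether this per-device layout actually beats a shared thread (or a thread per host CPU, as suggested above) is exactly the CPU-locality question raised in the review.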
Re: [PATCHv6 0/4] qemu-kvm: vhost net port
On Wed, Mar 24, 2010 at 02:38:57PM +0200, Avi Kivity wrote: On 03/17/2010 03:04 PM, Michael S. Tsirkin wrote: This is port of vhost v6 patch set I posted previously to qemu-kvm, for those that want to get good performance out of it :) This patchset needs to be applied when qemu.git one gets merged, this includes irqchip support. Ping me when this happens please. Ping -- error compiling committee.c: too many arguments to function
Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
On Tue, Apr 06, 2010 at 01:41:37PM +0800, Xin, Xiaohui wrote: Michael, For the DoS issue, I'm not sure how much of a limit on what get_user_pages() can pin is reasonable; should we compute the bandwidth to decide it? There's a ulimit for locked memory. Can we use this, decreasing the value in the rlimit array? We can do this when the backend is enabled and re-increment it when the backend is disabled. I have tried it with rlim[RLIMIT_MEMLOCK].rlim_cur, but I found the initial value for it is 0x10000; after a right shift by PAGE_SHIFT, it's only 16 pages we can lock then, which seems too small, since the guest virtio-net driver may submit a lot of requests at one time. Thanks Xiaohui Yes, that's the default, but the system administrator can always increase this value with ulimit if necessary. -- MST
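The arithmetic behind the "only 16 pages" observation: the historical default RLIMIT_MEMLOCK of 64KB (0x10000 bytes) divided by a 4KB page yields 16 lockable pages. The helper below is ours, for illustration; on a live system the actual limit comes from getrlimit().

```c
#include <assert.h>
#include <sys/resource.h>

/* Pages coverable by a locked-memory limit, mirroring the
 * rlim_cur >> PAGE_SHIFT computation from the thread. */
static unsigned long locked_pages(unsigned long rlim_bytes,
                                  unsigned long page_size)
{
    return rlim_bytes / page_size;
}

/* Query the real per-process limit (rlim_cur may be RLIM_INFINITY). */
static unsigned long current_memlock_bytes(void)
{
    struct rlimit rl;
    if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
        return 0;
    return (unsigned long)rl.rlim_cur;
}
```

A guest virtio ring easily posts more than 16 receive buffers at once, so the default limit is indeed far too small for pinning guest buffers; current_memlock_bytes() shows where the administrator-raised value would be read from.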
Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
On Tue, Apr 06, 2010 at 01:46:56PM +0800, Xin, Xiaohui wrote: Michael, For the write logging, do you have a function at hand with which we can recompute the log? If so, I think I can use it to recompute the log info when logging is suddenly enabled. For the outstanding requests, do you mean all the user buffers that were submitted before the logging ioctl changed? That may be a lot, and some of them are still in NIC ring descriptors. Waiting for them to finish may need some time. I think it is also reasonable that when the logging ioctl changes, logging changes just after that. The key point is that after the logging ioctl returns, any subsequent change to memory must be logged. It does not matter when the request was submitted; otherwise we will get memory corruption on migration. The change to memory happens in vhost_add_used_and_signal(), right? So after the ioctl returns, just recomputing the log info for the events in the async queue is OK, since the ioctl and write-log operations are all protected by vq->mutex. Thanks Xiaohui Yes, I think this will work. Thanks, so do you have the function to recompute the log info at hand that I can use? I vaguely remember that you mentioned it some time ago. Doesn't just rerunning vhost_get_vq_desc work?
Thanks Xiaohui drivers/vhost/net.c | 189 +++-- drivers/vhost/vhost.h | 10 +++ 2 files changed, 192 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 22d5fef..2aafd90 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -17,11 +17,13 @@ #include linux/workqueue.h #include linux/rcupdate.h #include linux/file.h +#include linux/aio.h #include linux/net.h #include linux/if_packet.h #include linux/if_arp.h #include linux/if_tun.h +#include linux/mpassthru.h #include net/sock.h @@ -47,6 +49,7 @@ struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; + struct kmem_cache *cache; /* Tells us whether we are polling a socket for TX. * We only do this when socket buffer fills up. * Protected by tx vq lock. */ @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock) net-tx_poll_state = VHOST_NET_POLL_STARTED; } +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + unsigned long flags; + + spin_lock_irqsave(vq-notify_lock, flags); + if (!list_empty(vq-notifier)) { + iocb = list_first_entry(vq-notifier, + struct kiocb, ki_list); + list_del(iocb-ki_list); + } + spin_unlock_irqrestore(vq-notify_lock, flags); + return iocb; +} + +static void handle_async_rx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + struct vhost_log *vq_log = NULL; + int rx_total_len = 0; + int log, size; + + if (vq-link_state != VHOST_VQ_LINK_ASYNC) + return; + + if (vq-receiver) + vq-receiver(vq); + + vq_log = unlikely(vhost_has_feature( + net-dev, VHOST_F_LOG_ALL)) ? 
vq-log : NULL; + while ((iocb = notify_dequeue(vq)) != NULL) { + vhost_add_used_and_signal(net-dev, vq, + iocb-ki_pos, iocb-ki_nbytes); + log = (int)iocb-ki_user_data; + size = iocb-ki_nbytes; + rx_total_len += iocb-ki_nbytes; + + if (iocb-ki_dtor) + iocb-ki_dtor(iocb); + kmem_cache_free(net-cache, iocb); + + if (unlikely(vq_log)) + vhost_log_write(vq, vq_log, log, size); + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) { + vhost_poll_queue(vq-poll); + break; + } + } +} + +static void handle_async_tx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + int tx_total_len = 0; + + if (vq-link_state != VHOST_VQ_LINK_ASYNC) + return; + + while ((iocb = notify_dequeue(vq)) != NULL) { + vhost_add_used_and_signal(net-dev, vq, + iocb-ki_pos, 0); + tx_total_len += iocb-ki_nbytes; + + if (iocb-ki_dtor) + iocb-ki_dtor(iocb); + + kmem_cache_free(net-cache, iocb); + if (unlikely(tx_total_len = VHOST_NET_WEIGHT)) { + vhost_poll_queue(vq-poll); + break; + } + } +} + /* Expects to be always run from workqueue - which acts as * read-size
Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
On Wed, Apr 07, 2010 at 10:41:08AM +0800, Xin, Xiaohui wrote: Michael, Qemu needs a userspace write, is that a synchronous one or an asynchronous one? It's a synchronous non-blocking write. Sorry, why does Qemu live migration need the device to have a userspace write? How does the write operation work? And why is a read operation not needed here? Thanks Xiaohui Roughly, with ethernet bridges, moving a device from one location in the network to another makes forwarding tables incorrect (or incomplete), until outgoing traffic from the device causes these tables to be updated. Since there's no guarantee that the guest will generate outgoing traffic, after migration qemu sends out several dummy packets itself. -- MST
Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
On Wed, Apr 07, 2010 at 09:36:36AM +0800, Xin, Xiaohui wrote: Michael, For the write logging, do you have a function in hand that we can use to recompute the log? If so, I think I can use it to recompute the log info when logging is suddenly enabled. For the outstanding requests, do you mean all the user buffers submitted before the logging ioctl changed? That may be a lot, and some of them are still in NIC ring descriptors. Waiting for them to finish may take some time. I think that when the logging ioctl is changed, having the logging take effect just after that is also reasonable. The key point is that after the logging ioctl returns, any subsequent change to memory must be logged. It does not matter when the request was submitted, otherwise we will get memory corruption on migration. The change to memory happens in vhost_add_used_and_signal(), right? So after the ioctl returns, just recomputing the log info for the events in the async queue is ok, since the ioctl and write log operations are all protected by vq->mutex. Thanks Xiaohui Yes, I think this will work. Thanks, so do you have the function to recompute the log info in hand that I can use? I vaguely remember that you mentioned it some time ago. Doesn't just rerunning vhost_get_vq_desc work? Am I missing something here? vhost_get_vq_desc() looks in the vq, finds the first available buffers, and converts them to an iovec. I think the first available buffer is not one of the buffers in the async queue, so I think rerunning vhost_get_vq_desc() cannot work. Thanks Xiaohui Right, but we can move the head back, so we'll find the same buffers again, or add a variant of vhost_get_vq_desc that will process descriptors already consumed.
Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
On Tue, Apr 06, 2010 at 01:32:53PM -0700, David L Stevens wrote:

This patch adds support for the Mergeable Receive Buffers feature to vhost_net.

+-DLS

Changes from previous revision:
1) renamed: vhost_discard_vq_desc -> vhost_discard_desc, vhost_get_heads -> vhost_get_desc_n, vhost_get_vq_desc -> vhost_get_desc
2) added heads as argument to vhost_get_desc_n
3) changed vq->heads from iovec to vring_used_elem, removed casts
4) changed vhost_add_used to do multiple elements in a single copy_to_user, or two when we wrap the ring.
5) removed rxmaxheadcount and available buffer checks in favor of running until an allocation failure, but making sure we break the loop if we get two in a row, indicating we have at least 1 buffer, but not enough for the current receive packet
6) restored non-vnet header handling

Signed-Off-By: David L Stevens dlstev...@us.ibm.com

Thanks! There's some whitespace damage, are you sending with your new sendmail setup? It seems to have worked for qemu patches ...

diff -ruNp net-next-p0/drivers/vhost/net.c net-next-v3/drivers/vhost/net.c
--- net-next-p0/drivers/vhost/net.c	2010-03-22 12:04:38.0 -0700
+++ net-next-v3/drivers/vhost/net.c	2010-04-06 12:54:56.0 -0700
@@ -130,9 +130,8 @@ static void handle_tx(struct vhost_net *
 	hdr_size = vq->hdr_size;
 	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
+		head = vhost_get_desc(&net->dev, vq, vq->iov,
+				      ARRAY_SIZE(vq->iov), &out, &in,
 					 NULL, NULL);
 		/* Nothing new? Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
@@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
-			tx_poll_start(net, sock);
+			if (err == -EAGAIN) {
+				vhost_discard_desc(vq, 1);
+				tx_poll_start(net, sock);
+			} else {
+				vq_err(vq, "sendmsg: errno %d\n", -err);
+				/* drop packet; do not discard/resend */
+				vhost_add_used_and_signal(&net->dev, vq, head,
+							  0);

vhost does not currently have a consistent error handling strategy: if we drop packets, we need to think about which other errors should cause packet drops. I prefer to just call vq_err for now, and have us look at handling segfaults etc. in a consistent way separately.

+			}
 			break;
 		}
 		if (err != len)
@@ -186,12 +192,25 @@ static void handle_tx(struct vhost_net *
 	unuse_mm(net->dev.mm);
 }

+static int vhost_head_len(struct sock *sk)
+{
+	struct sk_buff *head;
+	int len = 0;
+
+	lock_sock(sk);
+	head = skb_peek(&sk->sk_receive_queue);
+	if (head)
+		len = head->len;
+	release_sock(sk);
+	return len;
+}
+

I wonder whether it makes sense to check skb_queue_empty(&sk->sk_receive_queue) outside the lock, to reduce the cost of this call on an empty queue (we know that it happens at least once each time we exit the loop on rx)?

 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned head, out, in, log, s;
+	unsigned in, log, s;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -202,13 +221,14 @@ static void handle_rx(struct vhost_net *
 		.msg_flags = MSG_DONTWAIT,
 	};
-	struct virtio_net_hdr hdr = {
-		.flags = 0,
-		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	struct virtio_net_hdr_mrg_rxbuf hdr = {
+		.hdr.flags = 0,
+		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
 	};
+	int retries = 0;
 	size_t len, total_len = 0;
-	int err;
+	int err, headcount, datalen;
 	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
 	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
@@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-
Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
On Wed, Apr 07, 2010 at 05:00:39PM +0800, xiaohui@intel.com wrote:

From: Xin Xiaohui xiaohui@intel.com

---
Michael,
Thanks a lot for the explanation. I have drafted a patch for the qemu write after I looked into the tun driver. Does it do it the right way?

Thanks
Xiaohui

 drivers/vhost/mpassthru.c | 45 +
 1 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index e9449ac..1cde097 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -1065,6 +1065,49 @@ static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
 	return mask;
 }

+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct mp_struct *mp = mp_get(file->private_data);
+	struct sock *sk = mp->socket.sk;
+	struct sk_buff *skb;
+	int len, err;
+	ssize_t result;
+
+	if (!mp)
+		return -EBADFD;
+

Can this happen? When?

+	/* currently, async is not supported */
+	if (!is_sync_kiocb(iocb))
+		return -EFAULT;

Really necessary? I think do_sync_write handles all this.

+
+	len = iov_length(iov, count);
+	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+				  file->f_flags & O_NONBLOCK, &err);
+
+	if (!skb)
+		return -EFAULT;

Surely not EFAULT. -EAGAIN?

+
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb_put(skb, len);
+
+	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+		kfree_skb(skb);
+		return -EFAULT;
+	}
+	skb_set_network_header(skb, ETH_HLEN);

Is this really right or necessary? Also, probably need to check that length is at least ETH_ALEN before doing this.

+	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);

eth_type_trans?

+	skb->dev = mp->dev;
+
+	dev_queue_xmit(skb);
+	mp->dev->stats.tx_packets++;
+	mp->dev->stats.tx_bytes += len;

Doesn't the hard start xmit function for the device increment the counters?

+
+	mp_put(mp);
+	return result;
+}
+
 static int mp_chr_close(struct inode *inode, struct file *file)
 {
 	struct mp_file *mfile = file->private_data;
@@ -1084,6 +1127,8 @@ static int mp_chr_close(struct inode *inode, struct file *file)
 static const struct file_operations mp_fops = {
 	.owner		= THIS_MODULE,
 	.llseek		= no_llseek,
+	.write		= do_sync_write,
+	.aio_write	= mp_chr_aio_write,
 	.poll		= mp_chr_poll,
 	.unlocked_ioctl	= mp_chr_ioctl,
 	.open		= mp_chr_open,
--
1.5.4.4
Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
Some corrections:

On Wed, Apr 07, 2010 at 01:59:10PM +0300, Michael S. Tsirkin wrote:
[snip]
[GIT PULL] vhost-net fix for 2.6.34-rc3
David, The following tree includes a patch fixing an issue with vhost-net in 2.6.34-rc3. Please pull for 2.6.34. Thanks! The following changes since commit 2eaa9cfdf33b8d7fb7aff27792192e0019ae8fc6: Linux 2.6.34-rc3 (2010-03-30 09:24:39 -0700) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost Jeff Dike (1): vhost-net: fix vq_memory_access_ok error checking drivers/vhost/vhost.c |4 1 files changed, 4 insertions(+), 0 deletions(-)
Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
On Wed, Apr 07, 2010 at 02:07:18PM -0700, David Stevens wrote:

kvm-ow...@vger.kernel.org wrote on 04/07/2010 11:09:30 AM:

On Wed, Apr 07, 2010 at 10:37:17AM -0700, David Stevens wrote:

Thanks! There's some whitespace damage, are you sending with your new sendmail setup? It seems to have worked for qemu patches ...

Yes, I saw some line wraps in what I received, but I checked the original draft to be sure and they weren't there. Possibly from the relay; Sigh.

@@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
-			tx_poll_start(net, sock);
+			if (err == -EAGAIN) {
+				vhost_discard_desc(vq, 1);
+				tx_poll_start(net, sock);
+			} else {
+				vq_err(vq, "sendmsg: errno %d\n", -err);
+				/* drop packet; do not discard/resend */
+				vhost_add_used_and_signal(&net->dev, vq, head,
+							  0);

vhost does not currently have a consistent error handling strategy: if we drop packets, we need to think about which other errors should cause packet drops. I prefer to just call vq_err for now, and have us look at handling segfaults etc. in a consistent way separately.

I had to add this to avoid an infinite loop when I wrote a bad packet on the socket. I agree error handling needs a better look, but retrying a bad packet continuously while dumping in the log is what it was doing when I hit an error before this code. Isn't this better, at least until a second look?

Hmm, what do you mean 'continuously'? Don't we only try again on the next kick?

If the packet is corrupt (in my case, a missing vnet header during testing), every send will fail and we never make progress. I had thousands of error messages in the log (for the same packet) before I added this code.

Hmm, we do not want a buggy guest to be able to fill host logs. This is only if debugging is enabled though, right? We can also rate-limit the errors.

If the problem is with the packet, retrying the same one as the original code does will never recover. This isn't required for mergeable rx buffer support, so I can certainly remove it from this patch, but I think the original error handling doesn't handle a single corrupted packet very gracefully.

An approach I considered was to have qemu poll the vq_err fd and stop the device when an error is seen. My concern with dropping a tx packet is that it would make debugging very difficult.

@@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
+	while ((datalen = vhost_head_len(sock->sk))) {
+		headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
+					     vq_log, &log);

This looks like a bug, I think we need to pass datalen + header size to vhost_get_desc_n. Not sure how we know the header size that the backend will use though. Maybe just look at our features.

Yes; we have hdr_size, so I can add it here. It'll be 0 for the cases where the backend and guest both have a vnet header (either the regular or larger mergeable buffers one), but should be added in for future raw socket support.

So hdr_size is the wrong thing to add then. We need to add a non-zero value for tap now.

datalen includes the vnet_hdr in the tap case, so we don't need a non-zero hdr_size. The socket data has the entire packet and vnet_hdr, and that length is what we're getting from vhost_head_len().

I only see vhost_head_len returning skb->len. You are sure skb->len includes vnet_hdr for tap rx?

 		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(vq))) {
+		if (!headcount) {
+			if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
 				vhost_disable_notify(vq);
+				retries++;
 				continue;
 			}

Hmm. The reason we have the code at all, as the comment says, is because the guest could have added more buffers between the time we read the last index and the time we enabled notification. So if we just break like this the race still exists. We could remember the last head value we observed, and have
Re: [PATCH] vhost: Make it more scalable by creating a vhost thread per device.
On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote: On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: Make vhost scalable by creating a separate vhost thread per vhost device. This provides better scaling across multiple guests and with multiple interfaces in a guest. Thanks for looking into this. An alternative approach is to simply replace create_singlethread_workqueue with create_workqueue which would get us a thread per host CPU. It seems that in theory this should be the optimal approach wrt CPU locality, however, in practice a single thread seems to get better numbers. I have a TODO to investigate this. Could you try looking into this? Yes. I tried using create_workqueue(), but the results were not good, at least when the number of guest interfaces is less than the number of CPUs. I didn't try more than 8 guests. Creating a separate thread per guest interface seems to be more scalable based on the testing I have done so far. I will try some more tests and get some numbers to compare the following 3 options: single vhost thread, vhost thread per cpu, vhost thread per guest virtio interface. Here are the results with netperf TCP_STREAM 64K guest to host on an 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host CPU utilization.
Current default: single vhost thread
------------------------------------
1 guest:  12500  37%
2 guests: 12800  46%
3 guests: 12600  47%
4 guests: 12200  47%
5 guests: 12000  47%
6 guests: 11700  47%
7 guests: 11340  47%
8 guests: 11200  48%

vhost thread per cpu
--------------------
1 guest:   4900  25%
2 guests: 10800  49%
3 guests: 17100  67%
4 guests: 20400  84%
5 guests: 21000  90%
6 guests: 22500  92%
7 guests: 23500  96%
8 guests: 24500  99%

vhost thread per guest interface
--------------------------------
1 guest:  12500  37%
2 guests: 21000  72%
3 guests: 21600  79%
4 guests: 21600  85%
5 guests: 22500  89%
6 guests: 22800  94%
7 guests: 24500  98%
8 guests: 26400  99%

Thanks
Sridhar

Consider using Ingo's perf tool to get error bars, but looks good overall. One thing I note though is that we seem to be able to consume up to 99% CPU now. So I think with this approach we can no longer claim that we are just like some other parts of networking stack, doing work outside any cgroup, and we should make the vhost thread inherit the cgroup and cpu mask from the process calling SET_OWNER.

-- MST
Re: [PATCH] vhost: Make it more scalable by creating a vhost thread per device.
On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote: On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: Make vhost scalable by creating a separate vhost thread per vhost device. This provides better scaling across multiple guests and with multiple interfaces in a guest. Thanks for looking into this. An alternative approach is to simply replace create_singlethread_workqueue with create_workqueue which would get us a thread per host CPU. It seems that in theory this should be the optimal approach wrt CPU locality, however, in practice a single thread seems to get better numbers. I have a TODO to investigate this. Could you try looking into this? Yes. I tried using create_workqueue(), but the results were not good atleast when the number of guest interfaces is less than the number of CPUs. I didn't try more than 8 guests. Creating a separate thread per guest interface seems to be more scalable based on the testing i have done so far. I will try some more tests and get some numbers to compare the following 3 options. - single vhost thread - vhost thread per cpu - vhost thread per guest virtio interface Here are the results with netperf TCP_STREAM 64K guest to host on a 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host CPU utilization. 
Current default: single vhost thread
------------------------------------
1 guest:  12500  37%
2 guests: 12800  46%
3 guests: 12600  47%
4 guests: 12200  47%
5 guests: 12000  47%
6 guests: 11700  47%
7 guests: 11340  47%
8 guests: 11200  48%

vhost thread per cpu
--------------------
1 guest:   4900  25%
2 guests: 10800  49%
3 guests: 17100  67%
4 guests: 20400  84%
5 guests: 21000  90%
6 guests: 22500  92%
7 guests: 23500  96%
8 guests: 24500  99%

vhost thread per guest interface
--------------------------------
1 guest:  12500  37%
2 guests: 21000  72%
3 guests: 21600  79%
4 guests: 21600  85%
5 guests: 22500  89%
6 guests: 22800  94%
7 guests: 24500  98%
8 guests: 26400  99%

We can also have a thread per vq. Does it help?

Thanks
Sridhar
Re: [PATCH] vhost: Make it more scalable by creating a vhost thread per device.
On Mon, Apr 12, 2010 at 10:35:31AM -0700, Sridhar Samudrala wrote: On Sun, 2010-04-11 at 18:47 +0300, Michael S. Tsirkin wrote: On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote: On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote: On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote: On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote: Make vhost scalable by creating a separate vhost thread per vhost device. This provides better scaling across multiple guests and with multiple interfaces in a guest. Thanks for looking into this. An alternative approach is to simply replace create_singlethread_workqueue with create_workqueue which would get us a thread per host CPU. It seems that in theory this should be the optimal approach wrt CPU locality, however, in practice a single thread seems to get better numbers. I have a TODO to investigate this. Could you try looking into this? Yes. I tried using create_workqueue(), but the results were not good atleast when the number of guest interfaces is less than the number of CPUs. I didn't try more than 8 guests. Creating a separate thread per guest interface seems to be more scalable based on the testing i have done so far. I will try some more tests and get some numbers to compare the following 3 options. - single vhost thread - vhost thread per cpu - vhost thread per guest virtio interface Here are the results with netperf TCP_STREAM 64K guest to host on a 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host CPU utilization. 
Current default: single vhost thread
------------------------------------
1 guest:  12500  37%
2 guests: 12800  46%
3 guests: 12600  47%
4 guests: 12200  47%
5 guests: 12000  47%
6 guests: 11700  47%
7 guests: 11340  47%
8 guests: 11200  48%

vhost thread per cpu
--------------------
1 guest:   4900  25%
2 guests: 10800  49%
3 guests: 17100  67%
4 guests: 20400  84%
5 guests: 21000  90%
6 guests: 22500  92%
7 guests: 23500  96%
8 guests: 24500  99%

vhost thread per guest interface
--------------------------------
1 guest:  12500  37%
2 guests: 21000  72%
3 guests: 21600  79%
4 guests: 21600  85%
5 guests: 22500  89%
6 guests: 22800  94%
7 guests: 24500  98%
8 guests: 26400  99%

Thanks
Sridhar

Consider using Ingo's perf tool to get error bars, but looks good overall. What do you mean by getting error bars? How noisy are the numbers? I'd like to see something along the lines of 85% +- 2%. One thing I note though is that we seem to be able to consume up to 99% CPU now. So I think with this approach we can no longer claim that we are just like some other parts of networking stack, doing work outside any cgroup, and we should make the vhost thread inherit the cgroup and cpu mask from the process calling SET_OWNER. Yes. I am not sure what is the right interface to do this. I think we'll have to extend the work queue API for this. But this should also allow binding qemu to a set of cpus and automatically having the vhost thread inherit the same cpu mask. For numa, yes. Also need to inherit cgroup.

Thanks
Sridhar

-- MST
Re: [PATCH v4 0/3] PCI Shared memory device
On Wed, Apr 07, 2010 at 04:51:57PM -0600, Cam Macdonell wrote: Latest patch for PCI shared memory device that maps a host shared memory object to be shared between guests FWIW, I still think it would be better to reuse virtio-pci spec for feature negotiation, control etc even if Anthony nacks the reuse of virtio code in qemu. -- MST
Re: [PATCH] vhost-net: fix vq_memory_access_ok error checking
On Wed, Apr 07, 2010 at 09:59:10AM -0400, Jeff Dike wrote:

vq_memory_access_ok needs to check whether mem == NULL

Signed-off-by: Jeff Dike jd...@linux.intel.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com

This was already queued by me, you do not need to fill Dave's inbox with vhost patches.

---
 drivers/vhost/vhost.c |4
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7bd7a1e..b8e1127 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -235,6 +235,10 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
 			       int log_all)
 {
 	int i;
+
+	if (!mem)
+		return 0;
+
 	for (i = 0; i < mem->nregions; ++i) {
 		struct vhost_memory_region *m = mem->regions + i;
 		unsigned long a = m->userspace_addr;
--
1.7.0.2.280.gc6f05
Re: [PATCH] vhost-net: fix vq_memory_access_ok error checking
On Tue, Apr 13, 2010 at 06:01:21PM +0300, Michael S. Tsirkin wrote: On Wed, Apr 07, 2010 at 09:59:10AM -0400, Jeff Dike wrote: vq_memory_access_ok needs to check whether mem == NULL Signed-off-by: Jeff Dike jd...@linux.intel.com Signed-off-by: Michael S. Tsirkin m...@redhat.com This was already queued by me, you do not need to fill Dave's inbox with vhost patches. This was sent in error, please ignore. Sorry about the noise.
Re: [RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.
On Fri, Apr 02, 2010 at 03:25:00PM +0800, xiaohui@intel.com wrote: The idea is simple, just to pin the guest VM user space and then let host NIC driver has the chance to directly DMA to it. The patches are based on vhost-net backend driver. We add a device which provides proto_ops as sendmsg/recvmsg to vhost-net to send/recv directly to/from the NIC driver. KVM guest who use the vhost-net backend may bind any ethX interface in the host side to get copyless data transfer thru guest virtio-net frontend. The scenario is like this: The guest virtio-net driver submits multiple requests thru vhost-net backend driver to the kernel. And the requests are queued and then completed after corresponding actions in h/w are done. For read, user space buffers are dispensed to NIC driver for rx when a page constructor API is invoked. Means NICs can allocate user buffers from a page constructor. We add a hook in netif_receive_skb() function to intercept the incoming packets, and notify the zero-copy device. For write, the zero-copy deivce may allocates a new host skb and puts payload on the skb_shinfo(skb)-frags, and copied the header to skb-data. The request remains pending until the skb is transmitted by h/w. Here, we have ever considered 2 ways to utilize the page constructor API to dispense the user buffers. One: Modify __alloc_skb() function a bit, it can only allocate a structure of sk_buff, and the data pointer is pointing to a user buffer which is coming from a page constructor API. Then the shinfo of the skb is also from guest. When packet is received from hardware, the skb-data is filled directly by h/w. What we have done is in this way. Pros: We can avoid any copy here. Cons: Guest virtio-net driver needs to allocate skb as almost the same method with the host NIC drivers, say the size of netdev_alloc_skb() and the same reserved space in the head of skb. Many NIC drivers are the same with guest and ok for this. 
But some of the latest NIC drivers reserve special room in the skb head. To deal with it, we suggest providing a method in the guest virtio-net driver to ask for the parameters we are interested in from the NIC driver when we know which device we have bound to do zero-copy. Then we ask the guest to do so. Is that reasonable? Unfortunately, this would break compatibility with existing virtio. This also complicates migration. What is the room in the skb head used for? Two: Modify the driver to get the user buffer allocated from a page constructor API (to substitute alloc_page()); the user buffers are used as payload buffers and filled by h/w directly when a packet is received. The driver should associate the pages with the skb (skb_shinfo(skb)->frags). For the head buffer side, let the host allocate the skb, and h/w fills it. After that, the data filled in the host skb header will be copied into the guest header buffer which is submitted together with the payload buffer. Pros: We care less about the way the guest or host allocates their buffers. Cons: We still need a bit of copying here for the skb header. We are not sure which way is better here. The obvious question would be whether you see any speed difference with the two approaches. If no, then the second approach would be better. This is the first thing we want to get comments from the community on. We wish the modifications to the network part to be generic, not used by the vhost-net backend only; a user application may use it as well when the zero-copy device provides async read/write operations later. Please give comments especially for the network part modifications. We provide multiple submits and asynchronous notification to vhost-net too. Our goal is to improve the bandwidth and reduce the CPU usage. Exact performance data will be provided later. But for a simple test with netperf, we found bandwidth up and CPU % up too, but the bandwidth up ratio is much more than the CPU % up ratio. What we have not done yet: packet split support What does this mean, exactly? 
To support GRO And TSO/GSO? Performance tuning What we have done in v1: polish the RCU usage; deal with write logging in asynchronous mode in vhost; add a notifier block for the mp device; rename page_ctor to mp_port in netdevice.h to make it look generic; add mp_dev_change_flags() for the mp device to change NIC state; add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded; a small fix for a missing dev_put on failure; use a dynamic minor instead of a static minor number; a __KERNEL__ guard for mp_get_sock(). What we have done in v2: remove most of the RCU usage, since the ctor pointer is only changed by
Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.
On Wed, Apr 14, 2010 at 04:55:21PM +0200, Arnd Bergmann wrote: On Friday 09 April 2010, xiaohui@intel.com wrote: From: Xin Xiaohui xiaohui@intel.com Add a device to utilize the vhost-net backend driver for copy-less data transfer between guest FE and host NIC. It pins the guest user space to the host memory and provides proto_ops as sendmsg/recvmsg to vhost-net. Sorry for taking so long before finding the time to look at your code in more detail. It seems that you are duplicating a lot of functionality that is already in macvtap. I've asked about this before but then didn't look at your newer versions. Can you explain the value of introducing another interface to user land? Hmm, I have not noticed a lot of duplication. BTW macvtap also duplicates tun code, it might be a good idea for tun to export some functionality. I'm still planning to add zero-copy support to macvtap, hopefully reusing parts of your code, but do you think there is value in having both? If macvtap would get zero copy tx and rx, maybe not. But it's not immediately obvious whether zero-copy support for macvtap might work, though, especially for zero copy rx. The approach with mpassthru is much simpler in that it takes complete control of the device. diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c new file mode 100644 index 000..86d2525 --- /dev/null +++ b/drivers/vhost/mpassthru.c @@ -0,0 +1,1264 @@ + +#ifdef MPASSTHRU_DEBUG +static int debug; + +#define DBG if (mp->debug) printk +#define DBG1 if (debug == 2) printk +#else +#define DBG(a...) +#define DBG1(a...) +#endif This should probably just use the existing dev_dbg/pr_debug infrastructure. [... skipping buffer management code for now] +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ [...] This function looks like we should be able to easily include it into macvtap and get zero-copy transmits without introducing the new user-level interface. 
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len, + int flags) +{ + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp; + struct page_ctor *ctor; + struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private); It smells like a layering violation to look at the iocb->private field from a lower-level driver. I would have hoped that it's possible to implement this without having this driver know about the higher-level vhost driver internals. I agree. Can you explain why this is needed? + spin_lock_irqsave(&ctor->read_lock, flag); + list_add_tail(&info->list, &ctor->readq); + spin_unlock_irqrestore(&ctor->read_lock, flag); + + if (!vq->receiver) { + vq->receiver = mp_recvmsg_notify; + set_memlock_rlimit(ctor, RLIMIT_MEMLOCK, + vq->num * 4096, + vq->num * 4096); + } + + return 0; +} Not sure what I'm missing, but who calls the vq->receiver? This seems to be neither in the upstream version of vhost nor introduced by your patch. +static void __mp_detach(struct mp_struct *mp) +{ + mp->mfile = NULL; + + mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP); + page_ctor_detach(mp); + mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP); + + /* Drop the extra count on the net device */ + dev_put(mp->dev); +} + +static DEFINE_MUTEX(mp_mutex); + +static void mp_detach(struct mp_struct *mp) +{ + mutex_lock(&mp_mutex); + __mp_detach(mp); + mutex_unlock(&mp_mutex); +} + +static void mp_put(struct mp_file *mfile) +{ + if (atomic_dec_and_test(&mfile->count)) + mp_detach(mfile->mp); +} + +static int mp_release(struct socket *sock) +{ + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp; + struct mp_file *mfile = mp->mfile; + + mp_put(mfile); + sock_put(mp->socket.sk); + put_net(mfile->net); + + return 0; +} Doesn't this prevent the underlying interface from going away while the chardev is open? You also have logic to handle that case, so why do you keep the extra reference on the netdev? 
+/* Ops structure to mimic raw sockets with mp device */ +static const struct proto_ops mp_socket_ops = { + .sendmsg = mp_sendmsg, + .recvmsg = mp_recvmsg, + .release = mp_release, +}; +static int mp_chr_open(struct inode *inode, struct file * file) +{ + struct mp_file *mfile; + cycle_kernel_lock(); I don't think you really want to use the BKL here, just kill that line. +static long mp_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct mp_file *mfile = file->private_data; + struct mp_struct *mp; + struct net_device *dev; +
Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.
On Wed, Apr 14, 2010 at 05:57:54PM +0200, Arnd Bergmann wrote: On Wednesday 14 April 2010, Michael S. Tsirkin wrote: On Wed, Apr 14, 2010 at 04:55:21PM +0200, Arnd Bergmann wrote: On Friday 09 April 2010, xiaohui@intel.com wrote: From: Xin Xiaohui xiaohui@intel.com It seems that you are duplicating a lot of functionality that is already in macvtap. I've asked about this before but then didn't look at your newer versions. Can you explain the value of introducing another interface to user land? Hmm, I have not noticed a lot of duplication. The code is indeed quite distinct, but the idea of adding another character device to pass into vhost for direct device access is. All backends besides tap seem to do this, btw :) BTW macvtap also duplicates tun code, it might be a good idea for tun to export some functionality. Yes, that's something I plan to look into. I'm still planning to add zero-copy support to macvtap, hopefully reusing parts of your code, but do you think there is value in having both? If macvtap would get zero copy tx and rx, maybe not. But it's not immediately obvious whether zero-copy support for macvtap might work, though, especially for zero copy rx. The approach with mpassthru is much simpler in that it takes complete control of the device. As far as I can tell, the most significant limitation of mpassthru is that there can only ever be a single guest on a physical NIC. Given that limitation, I believe we can do the same on macvtap, and simply disable zero-copy RX when you want to use more than one guest, or both guest and host on the same NIC. The logical next step here would be to allow VMDq and similar technologies to separate out the RX traffic in the hardware. We don't have a configuration interface for that yet, but since this is logically the same as macvlan, I think we should use the same interfaces for both, essentially treating VMDq as a hardware acceleration for macvlan. 
We can probably handle it in similar ways to how we handle hardware support for vlan. At that stage, macvtap would be the logical interface for connecting a VMDq (hardware macvlan) device to a guest! I won't object to that but ... code walks. +static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct mp_struct *mp = mp_get(file->private_data); + struct sock *sk = mp->socket.sk; + struct sk_buff *skb; + int len, err; + ssize_t result; Can you explain what this function is even there for? AFAICT, vhost-net doesn't call it, the interface is incompatible with the existing tap interface, and you don't provide a read function. qemu needs the ability to inject raw packets into device from userspace, bypassing vhost/virtio (for live migration). Ok, but since there is only a write callback and no read, it won't actually be able to do this with the current code, right? I think it'll work as is, with vhost qemu only ever writes, never reads from device. We'll also never need GSO etc which is a large part of what tap does (and macvtap will have to do). Moreover, it seems weird to have a new type of interface here that duplicates tap/macvtap with less functionality. Coming back to your original comment, this means that while mpassthru is currently not duplicating the actual code from macvtap, it would need to do exactly that to get the qemu interface right! Arnd I don't think so, see above. anyway, both can reuse tun.c :) -- MST
Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.
On Wed, Apr 14, 2010 at 06:35:57PM +0200, Arnd Bergmann wrote: On Wednesday 14 April 2010, Michael S. Tsirkin wrote: qemu needs the ability to inject raw packets into device from userspace, bypassing vhost/virtio (for live migration). Ok, but since there is only a write callback and no read, it won't actually be able to do this with the current code, right? I think it'll work as is, with vhost qemu only ever writes, never reads from device. We'll also never need GSO etc which is a large part of what tap does (and macvtap will have to do). Ah, I see. I didn't realize that qemu needs to write to the device even if vhost is used. But for the case of migration to another machine without vhost, wouldn't qemu also need to read? Not that I know. Why? Moreover, it seems weird to have a new type of interface here that duplicates tap/macvtap with less functionality. Coming back to your original comment, this means that while mpassthru is currently not duplicating the actual code from macvtap, it would need to do exactly that to get the qemu interface right! I don't think so, see above. anyway, both can reuse tun.c :) There is one significant difference between macvtap/mpassthru and tun/tap in that the directions are reversed. While macvtap and mpassthru forward data from write into dev_queue_xmit and from skb_receive into read, tun/tap forwards data from write into skb_receive and from start_xmit into read. Also, I'm not really objecting to duplicating code between macvtap and mpassthru, as the implementation can always be merged. My main objection is instead to having two different _user_interfaces_ for doing the same thing. Arnd They *could* do the same thing :) -- MST
Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.
On Wed, Apr 14, 2010 at 10:39:49PM +0200, Arnd Bergmann wrote: On Wednesday 14 April 2010 22:31:42 Michael S. Tsirkin wrote: On Wed, Apr 14, 2010 at 06:35:57PM +0200, Arnd Bergmann wrote: On Wednesday 14 April 2010, Michael S. Tsirkin wrote: qemu needs the ability to inject raw packets into device from userspace, bypassing vhost/virtio (for live migration). Ok, but since there is only a write callback and no read, it won't actually be able to do this with the current code, right? I think it'll work as is, with vhost qemu only ever writes, never reads from device. We'll also never need GSO etc which is a large part of what tap does (and macvtap will have to do). Ah, I see. I didn't realize that qemu needs to write to the device even if vhost is used. But for the case of migration to another machine without vhost, wouldn't qemu also need to read? Not that I know. Why? Well, if the guest not only wants to send data but also receive frames coming from other machines, they need to get from the kernel into qemu, and the only way I can see for doing that is to read from this device if there is no vhost support around on the new machine. Maybe we're talking about different things here. Arnd mpassthrough is currently useless without vhost. If the new machine has no vhost, it can't use mpassthrough :) -- MST
Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.
On Thu, Apr 15, 2010 at 05:01:10PM +0800, Xin, Xiaohui wrote: It smells like a layering violation to look at the iocb->private field from a lower-level driver. I would have hoped that it's possible to implement this without having this driver know about the higher-level vhost driver internals. Can you explain why this is needed? I don't like this either, but the kiocb is maintained by vhost with a list_head, and the mp device is responsible for collecting the kiocbs into the list_head, so we need something known by both vhost and mp. Can't vhost supply a kiocb completion callback that will handle the list? -- MST
Re: [RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.
On Thu, Apr 15, 2010 at 05:36:07PM +0800, Xin, Xiaohui wrote: Michael, The idea is simple, just to pin the guest VM user space and then let the host NIC driver have the chance to DMA directly to it. The patches are based on the vhost-net backend driver. We add a device which provides proto_ops as sendmsg/recvmsg to vhost-net to send/recv directly to/from the NIC driver. A KVM guest which uses the vhost-net backend may bind any ethX interface on the host side to get copyless data transfer thru the guest virtio-net frontend. The scenario is like this: The guest virtio-net driver submits multiple requests thru the vhost-net backend driver to the kernel, and the requests are queued and then completed after the corresponding actions in h/w are done. For read, user space buffers are dispensed to the NIC driver for rx when a page constructor API is invoked; that means NICs can allocate user buffers from a page constructor. We add a hook in the netif_receive_skb() function to intercept the incoming packets and notify the zero-copy device. For write, the zero-copy device may allocate a new host skb, put the payload on skb_shinfo(skb)->frags, and copy the header to skb->data. The request remains pending until the skb is transmitted by h/w. Here, we have considered 2 ways to utilize the page constructor API to dispense the user buffers. One: Modify the __alloc_skb() function a bit so that it only allocates a struct sk_buff, with the data pointer pointing to a user buffer which comes from a page constructor API. Then the shinfo of the skb is also from the guest. When a packet is received from hardware, skb->data is filled directly by h/w. What we have done is in this way. Pros: We can avoid any copy here. Cons: The guest virtio-net driver needs to allocate skbs in almost the same way as the host NIC drivers, say the size of netdev_alloc_skb() and the same reserved space in the head of the skb. Many NIC drivers are the same as the guest and OK with this. 
But some of the latest NIC drivers reserve special room in the skb head. To deal with it, we suggest providing a method in the guest virtio-net driver to ask for the parameters we are interested in from the NIC driver when we know which device we have bound to do zero-copy. Then we ask the guest to do so. Is that reasonable? Unfortunately, this would break compatibility with existing virtio. This also complicates migration. You mean any modification to the guest virtio-net driver will break the compatibility? We tried to enlarge the virtio_net_config to contain the 2 parameters, and add a VIRTIO_NET_F_PASSTHRU flag; virtionet_probe() will check the feature flag and get the parameters, then the virtio-net driver uses them to allocate buffers. How about this? This means that we can't, for example, live-migrate between different systems without flushing outstanding buffers. What is the room in the skb head used for? I'm not sure, but the latest ixgbe driver does this, it reserves 32 bytes compared to NET_IP_ALIGN. Looking at the code, this seems to have to do with alignment - could just be a performance optimization. Two: Modify the driver to get the user buffer allocated from a page constructor API (to substitute alloc_page()); the user buffers are used as payload buffers and filled by h/w directly when a packet is received. The driver should associate the pages with the skb (skb_shinfo(skb)->frags). For the head buffer side, let the host allocate the skb, and h/w fills it. After that, the data filled in the host skb header will be copied into the guest header buffer which is submitted together with the payload buffer. Pros: We care less about the way the guest or host allocates their buffers. Cons: We still need a bit of copying here for the skb header. We are not sure which way is better here. The obvious question would be whether you see any speed difference with the two approaches. If no, then the second approach would be better. I remember the second approach is a bit slower at 1500 MTU. But we did not test it much. 
Well, that's an important datapoint. By the way, you'll need header copy to activate LRO in the host, so that's a good reason to go with option 2 as well. This is the first thing we want to get comments from the community on. We wish the modifications to the network part to be generic, not used by the vhost-net backend only; a user application may use it as well when the zero-copy device provides async read/write operations later. Please give comments especially for the network part modifications. We provide multiple submits and asynchronous notification to vhost-net too. Our goal is to improve the bandwidth and reduce the CPU usage. Exact performance data will be provided later. But for simple
Re: [RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.
On Mon, Apr 19, 2010 at 06:05:17PM +0800, Xin, Xiaohui wrote: Michael, The idea is simple, just to pin the guest VM user space and then let the host NIC driver have the chance to DMA directly to it. The patches are based on the vhost-net backend driver. We add a device which provides proto_ops as sendmsg/recvmsg to vhost-net to send/recv directly to/from the NIC driver. A KVM guest which uses the vhost-net backend may bind any ethX interface on the host side to get copyless data transfer thru the guest virtio-net frontend. The scenario is like this: The guest virtio-net driver submits multiple requests thru the vhost-net backend driver to the kernel, and the requests are queued and then completed after the corresponding actions in h/w are done. For read, user space buffers are dispensed to the NIC driver for rx when a page constructor API is invoked; that means NICs can allocate user buffers from a page constructor. We add a hook in the netif_receive_skb() function to intercept the incoming packets and notify the zero-copy device. For write, the zero-copy device may allocate a new host skb, put the payload on skb_shinfo(skb)->frags, and copy the header to skb->data. The request remains pending until the skb is transmitted by h/w. Here, we have considered 2 ways to utilize the page constructor API to dispense the user buffers. One: Modify the __alloc_skb() function a bit so that it only allocates a struct sk_buff, with the data pointer pointing to a user buffer which comes from a page constructor API. Then the shinfo of the skb is also from the guest. When a packet is received from hardware, skb->data is filled directly by h/w. What we have done is in this way. Pros: We can avoid any copy here. Cons: The guest virtio-net driver needs to allocate skbs in almost the same way as the host NIC drivers, say the size of netdev_alloc_skb() and the same reserved space in the head of the skb. Many NIC drivers are the same as the guest and OK with this. 
But some of the latest NIC drivers reserve special room in the skb head. To deal with it, we suggest providing a method in the guest virtio-net driver to ask for the parameters we are interested in from the NIC driver when we know which device we have bound to do zero-copy. Then we ask the guest to do so. Is that reasonable? Unfortunately, this would break compatibility with existing virtio. This also complicates migration. You mean any modification to the guest virtio-net driver will break the compatibility? We tried to enlarge the virtio_net_config to contain the 2 parameters, and add a VIRTIO_NET_F_PASSTHRU flag; virtionet_probe() will check the feature flag and get the parameters, then the virtio-net driver uses them to allocate buffers. How about this? This means that we can't, for example, live-migrate between different systems without flushing outstanding buffers. Ok. What we have thought about now is to do something with skb_reserve(). If the device is bound by mp, then skb_reserve() will do nothing with it. What is the room in the skb head used for? I'm not sure, but the latest ixgbe driver does this, it reserves 32 bytes compared to NET_IP_ALIGN. Looking at the code, this seems to have to do with alignment - could just be a performance optimization. Two: Modify the driver to get the user buffer allocated from a page constructor API (to substitute alloc_page()); the user buffers are used as payload buffers and filled by h/w directly when a packet is received. The driver should associate the pages with the skb (skb_shinfo(skb)->frags). For the head buffer side, let the host allocate the skb, and h/w fills it. After that, the data filled in the host skb header will be copied into the guest header buffer which is submitted together with the payload buffer. Pros: We care less about the way the guest or host allocates their buffers. Cons: We still need a bit of copying here for the skb header. We are not sure which way is better here. The obvious question would be whether you see any speed difference with the two approaches. 
If no, then the second approach would be better. I remember the second approach is a bit slower at 1500 MTU. But we did not test it much. Well, that's an important datapoint. By the way, you'll need header copy to activate LRO in the host, so that's a good reason to go with option 2 as well. This is the first thing we want to get comments from the community on. We wish the modifications to the network part to be generic, not used by the vhost-net backend only, but a
Re: [PATCH] virtio-spec: document block CMD and FLUSH
On Fri, Feb 19, 2010 at 12:22:20AM +0200, Michael S. Tsirkin wrote: I took a stab at documenting CMD and FLUSH request types in virtio block. Any comments?
Re: [RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.
On Tue, Apr 20, 2010 at 10:21:55AM +0800, Xin, Xiaohui wrote: Michael, What we have not done yet: packet split support What does this mean, exactly? We can support 1500 MTU, but for jumbo frames, since the vhost driver didn't support mergeable buffers before, we cannot try it for multiple sg. I do not see why, vhost currently supports 64K buffers with indirect descriptors. The receive_skb() in the guest virtio-net driver will merge the multiple sg into skb frags; how can indirect descriptors do that? See add_recvbuf_big. I don't mean this, that's for buffer submission. I mean when a packet is received, in receive_buf(), mergeable buffer support knows which received pages can be hooked into skb frags; it's receive_mergeable() which does this. When a NIC driver supports packet split mode, each ring descriptor contains a skb and a page. When a packet is received, if the status is not EOP, the page of the next descriptor is hooked to the prev skb. We don't know how many frags belong to one skb. So when the guest submits buffers, it should submit multiple pages, and on receive, the guest should know which pages belong to one skb and hook them together. I think receive_mergeable() can do this, but I don't see how big-packets handle this. Am I missing something here? Thanks Xiaohui Yes, I think this packet split mode probably maps well to mergeable buffer support. Note that 1. Not all devices support large packets in this way, others might map to indirect buffers better So we have to figure out how migration is going to work 2. It's up to the guest driver whether to enable features such as mergeable buffers and indirect buffers So we have to figure out how to notify the guest which mode is optimal for a given device 3. We don't want to depend on jumbo frames for decent performance So we probably should support GSO/GRO -- MST
Re: [PATCH V3] drivers/uio/uio_pci_generic.c: allow access for non-privileged processes
On Mon, Apr 19, 2010 at 03:05:35PM -0700, Tom Lyon wrote: These are changes to uio_pci_generic.c to allow better use of the driver by non-privileged processes. 1. Add back old code which allowed interrupt re-enablement through the uio fd. 2. Translate PCI BARs to uio mmap regions, to allow mmap through the uio fd. Since it's common for drivers to need configuration cycles for device control, the above 2 are not sufficient for generic devices. And if you fix the above, you won't need irqcontrol, which IMO we are better off saving for stuff like eventfd mapping. Also - this modifies a kernel/userspace interface in a way that makes an operation that was always safe previously potentially unsafe. Also, BAR regions could be less than 1 page in size; mapping these to an unprivileged process is a security problem. Also, for a generic driver, we likely need write combining support in the interface. Also, io space often cannot be mmapped. We need read/write for that. 3. Allow devices which support MSI or MSI-X, but not IRQ. If the device supports MSI or MSI-X, it can perform PCI writes upstream, and MSI-X vectors are controlled through memory. So with MSI-X + mmap to an unprivileged process you can easily cause the device to modify any memory. With MSI it's hard to be sure, maybe some devices might make guarantees not to do writes except for MSI, but there's no generic way to declare that: bus master needs to be enabled for MSI to work, and once bus master is enabled, nothing seems to prevent the device from corrupting host memory. Signed-off-by: Tom Lyon p...@cisco.com So the patch doesn't look generic enough or safe enough for the users I have in mind (virtualization). What users/devices do you have in mind? 
For virtualization, I've been thinking about unprivileged access and msi as well, and here's a plan I thought might work: - add a uio_iommu character device that controls an iommu domain - uio_iommu would make sure the iommu is programmed in a safe way - use irqcontrol to bind a pci device to an iommu domain - allow config cycles through the uio fd, but force bus master to 0 unless the device is bound to a domain - for sub-page regions, and io, we can't allow mmap to an unprivileged process. extend irqcontrol to allow read/write and range-check the operations - for msi/msix, drivers use multiple vectors. One idea is to map them by binding an eventfd to a vector. This approach has the advantage that in virtualization space, kvm already can consume eventfd descriptors. That's it at a high level --- checkpatch.pl is happy with this one. --- linux-2.6.33/drivers/uio/uio_pci_generic.c 2010-02-24 10:52:17.0 -0800 +++ mylinux-2.6.33/drivers/uio/uio_pci_generic.c 2010-04-19 14:57:21.0 -0700 @@ -14,9 +14,10 @@ * # ls -l /sys/bus/pci/devices/0000:00:19.0/driver * .../0000:00:19.0/driver -> ../../../bus/pci/drivers/uio_pci_generic * - * Driver won't bind to devices which do not support the Interrupt Disable Bit - * in the command register. All devices compliant to PCI 2.3 (circa 2002) and - * all compliant PCI Express devices should support this bit. + * Driver won't bind to devices which do not support MSI, MSI-x, or the + * Interrupt Disable Bit in the command register. All devices compliant + * to PCI 2.3 (circa 2002) and all compliant PCI Express devices should + * support one of these. */ #include <linux/device.h> @@ -27,7 +28,7 @@ #define DRIVER_VERSION "0.01.0" #define DRIVER_AUTHOR "Michael S. 
Tsirkin <m...@redhat.com>" -#define DRIVER_DESC "Generic UIO driver for PCI 2.3 devices" +#define DRIVER_DESC "Generic UIO driver for PCIe PCI 2.3 devices" struct uio_pci_generic_dev { struct uio_info info; @@ -41,6 +42,39 @@ to_uio_pci_generic_dev(struct uio_info * return container_of(info, struct uio_pci_generic_dev, info); } +/* Read/modify/write command register to disable interrupts. + * Note: we could cache the value and optimize the read if there was a way to + * get notified of user changes to command register through sysfs. + * */ +static void irqtoggle(struct uio_pci_generic_dev *gdev, int irq_on) +{ + struct pci_dev *pdev = gdev->pdev; + unsigned long flags; + u16 orig, new; + + spin_lock_irqsave(&gdev->lock, flags); + pci_block_user_cfg_access(pdev); + pci_read_config_word(pdev, PCI_COMMAND, &orig); + new = irq_on ? (orig & ~PCI_COMMAND_INTX_DISABLE) + : (orig | PCI_COMMAND_INTX_DISABLE); + if (new != orig) + pci_write_config_word(gdev->pdev, PCI_COMMAND, new); + pci_unblock_user_cfg_access(pdev); + spin_unlock_irqrestore(&gdev->lock, flags); +} + +/* irqcontrol is used by userspace to enable/disable interrupts. */ +/* A privileged app can write the PCI_COMMAND register directly, + * but we need this for normal apps + */ +static int irqcontrol(struct uio_info *info, s32 irq_on) +{ + struct
Re: [PATCH V3] drivers/uio/uio_pci_generic.c: allow access for non-privileged processes
On Wed, Apr 21, 2010 at 12:31:50PM +0200, Hans J. Koch wrote: On Wed, Apr 21, 2010 at 12:38:49PM +0300, Michael S. Tsirkin wrote: + j++; + } + } + for (i = 0, j = 0; i < PCI_STD_RESOURCE_END && + j < MAX_UIO_PORT_REGIONS; i++) { + if (pci_resource_flags(pdev, i) & IORESOURCE_IO) { + name = kmalloc(8, GFP_KERNEL); + if (name == NULL) + break; + sprintf(name, "iobar%d", i); + info->port[j].name = name; + info->port[j].start = pci_resource_start(pdev, i); + info->port[j].size = pci_resource_len(pdev, i); + info->port[j].porttype = UIO_PORT_X86; + j++; At least on x86, I think an io bar can not be mmapped. That's right. porttype == UIO_PORT_X86 is only there for information purposes. Userspace then knows that it cannot map this but has to use things like inb(), outb() and friends after getting access rights with ioperm()/iopl(). start and size give userspace the information needed to do this. Thanks, Hans So that fails in the declared purpose of allowing an unprivileged userspace driver, as inb/outb are privileged operations. -- MST
Re: [Qemu-devel] [PATCH] virtio-spec: document block CMD and FLUSH
On Tue, Apr 20, 2010 at 02:22:58PM +0100, Paul Brook wrote: Does this mean that virtio-blk supports all three combinations? 1. FLUSH that isn't a barrier 2. FLUSH that is also a barrier 3. Barrier that is not a flush 1 is good for fsync-like operations; 2 is good for journalling-like ordered operations. 3 sounds like it doesn't mean a lot as the host cache provides no guarantees and has no ordering facility that can be used. (3) allows the guest to queue overlapping transfers with well defined results. I have no idea how useful this is in practice, but it's certainly plausible. Paul In theory, yes. At the moment, qemu only implements FLUSH and lguest only implements barrier without FLUSH. If you think it's useful, maybe start by using FLUSH+barrier in linux guest driver, that'd demonstrate how it's used. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
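For reference, the three combinations correspond to how the type field of a virtio-blk request header is encoded: BARRIER is a flag OR-ed into the command rather than a command of its own, so case 1 is bare VIRTIO_BLK_T_FLUSH, case 2 is VIRTIO_BLK_T_FLUSH | VIRTIO_BLK_T_BARRIER, and case 3 is, e.g., VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER. The constants below follow the virtio spec; the two helpers are illustrative, not part of it:

```c
#include <assert.h>
#include <stdint.h>

/* Request type values from the virtio block spec; BARRIER is a flag
 * OR-ed into the command, not a standalone command. */
#define VIRTIO_BLK_T_IN      0U          /* read */
#define VIRTIO_BLK_T_OUT     1U          /* write */
#define VIRTIO_BLK_T_FLUSH   4U          /* flush the volatile write cache */
#define VIRTIO_BLK_T_BARRIER 0x80000000U /* barrier before this request */

/* Illustrative decoding helpers. */
int is_barrier(uint32_t type)   { return (type & VIRTIO_BLK_T_BARRIER) != 0; }
uint32_t command(uint32_t type) { return type & ~VIRTIO_BLK_T_BARRIER; }
```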
Re: [RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.
On Thu, Apr 22, 2010 at 04:57:56PM +0800, Xin, Xiaohui wrote: Michael, Yes, I think this packet split mode probably maps well to mergeable buffer support. Note that 1. Not all devices support large packets in this way, others might map to indirect buffers better

Do the indirect buffers correspond to skb->frag_list? We currently use skb->frags.

So we have to figure out how migration is going to work

Yes, different guest virtio-net drivers may contain different features. Does qemu migration work with different features supported by the virtio-net driver now?

For now, you must have identical feature-sets for migration to work. And as long as we manage the buffers in software, we can always make features match.

2. It's up to guest driver whether to enable features such as mergeable buffers and indirect buffers So we have to figure out how to notify guest which mode is optimal for a given device

Yes. When a device is bound, the mp device may query the capabilities from the driver. Actually, there is a structure now in the mp device that can do this; we can add some fields to support more.

3. We don't want to depend on jumbo frames for decent performance So we probably should support GSO/GRO

GSO is for the tx side, right? I think the driver can handle it itself. For GRO, I'm not sure whether it's easy or not. Basically, the mp device we now support is doing what a raw socket does. The packets are not going to the host stack.

See commit bfd5f4a3d605e0f6054df0b59fe0907ff7e696d3 (it doesn't currently work with vhost net, but that's a separate story). -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
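The "make features match" point can be restated as a plain bitmask check: migration is safe when every feature bit the guest acked on the source is also offered by the destination. This is a simplification of what qemu actually enforces today (identical feature sets, as stated above), and the names are illustrative:

```c
#include <assert.h>
#include <stdint.h>

/* The features the guest acked on the source must be a subset of what
 * the destination host offers, or the guest's assumptions break. */
int migration_features_ok(uint64_t guest_acked, uint64_t dst_offered)
{
    return (guest_acked & dst_offered) == guest_acked;
}
```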
Re: [RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.
On Thu, Apr 22, 2010 at 04:37:16PM +0800, xiaohui@intel.com wrote: From: Xin Xiaohui xiaohui@intel.com The vhost-net backend now only supports synchronous send/recv operations. The patch provides multiple submits and asynchronous notifications. This is needed for zero-copy case. Signed-off-by: Xin Xiaohui xiaohui@intel.com --- Michael, Can't vhost supply a kiocb completion callback that will handle the list? Yes, thanks. And with it I also remove the vq-receiver finally. Thanks Xiaohui Nice progress. I commented on some minor issues below. Thanks! drivers/vhost/net.c | 227 +++-- drivers/vhost/vhost.c | 115 ++--- drivers/vhost/vhost.h | 14 +++ 3 files changed, 301 insertions(+), 55 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 22d5fef..4a70f66 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -17,11 +17,13 @@ #include linux/workqueue.h #include linux/rcupdate.h #include linux/file.h +#include linux/aio.h #include linux/net.h #include linux/if_packet.h #include linux/if_arp.h #include linux/if_tun.h +#include linux/mpassthru.h #include net/sock.h @@ -47,6 +49,7 @@ struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; + struct kmem_cache *cache; /* Tells us whether we are polling a socket for TX. * We only do this when socket buffer fills up. * Protected by tx vq lock. 
*/ @@ -91,11 +94,132 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock) net-tx_poll_state = VHOST_NET_POLL_STARTED; } +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + unsigned long flags; + + spin_lock_irqsave(vq-notify_lock, flags); + if (!list_empty(vq-notifier)) { + iocb = list_first_entry(vq-notifier, + struct kiocb, ki_list); + list_del(iocb-ki_list); + } + spin_unlock_irqrestore(vq-notify_lock, flags); + return iocb; +} + +static void handle_iocb(struct kiocb *iocb) +{ + struct vhost_virtqueue *vq = iocb-private; + unsigned long flags; + +spin_lock_irqsave(vq-notify_lock, flags); +list_add_tail(iocb-ki_list, vq-notifier); +spin_unlock_irqrestore(vq-notify_lock, flags); +} + checkpatch.pl does not complain about the above? +static void handle_async_rx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq, + struct socket *sock) continuation lines should start to the right of (. +{ + struct kiocb *iocb = NULL; + struct vhost_log *vq_log = NULL; + int rx_total_len = 0; + unsigned int head, log, in, out; + int size; + + if (vq-link_state != VHOST_VQ_LINK_ASYNC) + return; + + if (sock-sk-sk_data_ready) + sock-sk-sk_data_ready(sock-sk, 0); + + vq_log = unlikely(vhost_has_feature( + net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL; split the above line at ?, continuation being to the left of ( looks ugly. + while ((iocb = notify_dequeue(vq)) != NULL) { + vhost_add_used_and_signal(net-dev, vq, + iocb-ki_pos, iocb-ki_nbytes); + log = (int)(iocb-ki_user_data 32); how about we always do the recompute step, and not encode the log bit in ki_user_data? + size = iocb-ki_nbytes; + head = iocb-ki_pos; + rx_total_len += iocb-ki_nbytes; + + if (iocb-ki_dtor) + iocb-ki_dtor(iocb); + kmem_cache_free(net-cache, iocb); + + /* when log is enabled, recomputing the log info is needed, + * since these buffers are in async queue, and may not get + * the log info before. 
+ */ + if (unlikely(vq_log)) { + if (!log) + __vhost_get_vq_desc(net-dev, vq, vq-iov, + ARRAY_SIZE(vq-iov), + out, in, vq_log, + log, head); + vhost_log_write(vq, vq_log, log, size); + } + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) { + vhost_poll_queue(vq-poll); + break; + } + } +} + +static void handle_async_tx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + int tx_total_len = 0; + + if (vq-link_state != VHOST_VQ_LINK_ASYNC) + return; + + while
Re: [PATCH v4] Add mergeable RX bufs support to vhost
On Mon, Apr 19, 2010 at 03:12:19PM -0700, David L Stevens wrote: This patch adds the mergeable RX buffers feature to vhost. Signed-off-by: David L Stevens dlstev...@us.ibm.com BTW, which userspace should I use for testing this? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] Add mergeable RX bufs support to vhost
On Thu, Apr 22, 2010 at 11:47:15AM -0600, David Stevens wrote: Michael S. Tsirkin m...@redhat.com wrote on 04/22/2010 05:02:25 AM: On Mon, Apr 19, 2010 at 03:12:19PM -0700, David L Stevens wrote: This patch adds the mergeable RX buffers feature to vhost. Signed-off-by: David L Stevens dlstev...@us.ibm.com Looks pretty clean to me. Could you send a checkpatch-clean version please? The original passes checkpatch already, but I guess I must still be getting whitespace mangling if it didn't for you. (sigh) Here it is as an attachment: No good either. I thought you managed to run sendmail somewhere? Failing that, put it on dropbox and let me know. See http://kbase.redhat.com/faq/docs/DOC-2113 -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] Add mergeable RX bufs support to vhost
On Thu, Apr 22, 2010 at 11:59:55AM -0600, David Stevens wrote: Michael S. Tsirkin m...@redhat.com wrote on 04/22/2010 10:43:49 AM: On Mon, Apr 19, 2010 at 03:12:19PM -0700, David L Stevens wrote: This patch adds the mergeable RX buffers feature to vhost. Signed-off-by: David L Stevens dlstev...@us.ibm.com BTW, which userspace should I use for testing this? I use the qemu-kvm patch to your git tree that I posted a while back (also attached). Doesn't apply, probably also corrupted. Could you put it up on dropbox please? And it needs your TUNSETVNETHDRSIZE ioctl in the kernel as well. Also, I added the TUN ioctl defines to /usr/include for this to pick up (it compiles, but won't do the ioctl's without that); will get back to that patch per your comments next. [also correction: I said 10-20% guest to host, I meant host-to-guest (ie, the receiver side). h2g appears improved too, but not as much)] +-DLS -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv5] add mergeable receiver buffers support to vhost
On Fri, Apr 23, 2010 at 01:06:45PM -0700, David L Stevens wrote: This patch adds mergeable receive buffers support to vhost. Signed-off-by: David L Stevens dlstev...@us.ibm.com It seems the logging is wrong. Did you test live migration? Please do. I think reason for the bug could be you did some cut and paste from code that was there before 86e9424d7252bae5ad1c17b4b8088193e6b27cbe. So I put a suggestion on reducing this duplication a bit, below. Also, I think this patch adds sparse errors: some __user annotations seem missing. Could you please make your patch apply on top of patch 'vhost: fix sparse warnings' from Christoph Hellwig, and then make sure your patch does not add new sparse errors? I also wanted to make some coding style tweaks, to make patch match the style of the rest of the code, I could do them myself but since there's these issues, and we need another round, I put them in comments in mail below. Thanks! diff -ruNp net-next-v0/drivers/vhost/net.c net-next-v5/drivers/vhost/net.c --- net-next-v0/drivers/vhost/net.c 2010-04-22 11:31:57.0 -0700 +++ net-next-v5/drivers/vhost/net.c 2010-04-22 12:41:17.0 -0700 @@ -109,7 +109,7 @@ static void handle_tx(struct vhost_net * }; size_t len, total_len = 0; int err, wmem; - size_t hdr_size; + size_t vhost_hlen; struct socket *sock = rcu_dereference(vq-private_data); if (!sock) return; @@ -128,13 +128,13 @@ static void handle_tx(struct vhost_net * if (wmem sock-sk-sk_sndbuf / 2) tx_poll_stop(net); - hdr_size = vq-hdr_size; + vhost_hlen = vq-vhost_hlen; for (;;) { - head = vhost_get_vq_desc(net-dev, vq, vq-iov, - ARRAY_SIZE(vq-iov), - out, in, - NULL, NULL); + head = vhost_get_desc(net-dev, vq, vq-iov, + ARRAY_SIZE(vq-iov), + out, in, + NULL, NULL); /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq-num) { wmem = atomic_read(sock-sk-sk_wmem_alloc); @@ -155,20 +155,20 @@ static void handle_tx(struct vhost_net * break; } /* Skip header. TODO: support TSO. 
*/ - s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, out); + s = move_iovec_hdr(vq-iov, vq-hdr, vhost_hlen, out); msg.msg_iovlen = out; len = iov_length(vq-iov, out); /* Sanity check */ if (!len) { vq_err(vq, Unexpected header len for TX: %zd expected %zd\n, -iov_length(vq-hdr, s), hdr_size); +iov_length(vq-hdr, s), vhost_hlen); break; } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock-ops-sendmsg(NULL, sock, msg, len); if (unlikely(err 0)) { - vhost_discard_vq_desc(vq); + vhost_discard_desc(vq, 1); tx_poll_start(net, sock); break; } @@ -187,12 +187,25 @@ static void handle_tx(struct vhost_net * unuse_mm(net-dev.mm); } +static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk) +{ + struct sk_buff *head; + int len = 0; + + lock_sock(sk); + head = skb_peek(sk-sk_receive_queue); + if (head) + len = head-len + vq-sock_hlen; + release_sock(sk); + return len; +} + /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_virtqueue *vq = net-dev.vqs[VHOST_NET_VQ_RX]; - unsigned head, out, in, log, s; + unsigned in, log, s; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, @@ -203,14 +216,14 @@ static void handle_rx(struct vhost_net * .msg_flags = MSG_DONTWAIT, }; - struct virtio_net_hdr hdr = { - .flags = 0, - .gso_type = VIRTIO_NET_HDR_GSO_NONE + struct virtio_net_hdr_mrg_rxbuf hdr = { + .hdr.flags = 0, + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t len, total_len = 0; - int err; - size_t hdr_size; + int err, headcount, datalen; + size_t vhost_hlen; struct socket *sock = rcu_dereference(vq-private_data); if (!sock || skb_queue_empty(sock-sk-sk_receive_queue)) return; @@ -218,18 +231,18 @@ static void handle_rx(struct vhost_net * use_mm(net-dev.mm); mutex_lock(vq-mutex);
Re: [RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.
On Fri, Apr 23, 2010 at 03:08:33PM +0800, xiaohui@intel.com wrote: From: Xin Xiaohui xiaohui@intel.com The vhost-net backend now only supports synchronous send/recv operations. The patch provides multiple submits and asynchronous notifications. This is needed for zero-copy case. Signed-off-by: Xin Xiaohui xiaohui@intel.com --- Michael, Can't vhost supply a kiocb completion callback that will handle the list? Yes, thanks. And with it I also remove the vq-receivr finally. Thanks Xiaohui Nice progress. I commented on some minor issues below. Thanks! The updated patch addressed your comments on the minor issues. Thanks! Thanks Xiaohui drivers/vhost/net.c | 236 +++- drivers/vhost/vhost.c | 120 ++--- drivers/vhost/vhost.h | 14 +++ 3 files changed, 314 insertions(+), 56 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 38989d1..18f6c41 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -23,6 +23,8 @@ #include linux/if_arp.h #include linux/if_tun.h #include linux/if_macvlan.h +#include linux/mpassthru.h +#include linux/aio.h #include net/sock.h @@ -48,6 +50,7 @@ struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; + struct kmem_cache *cache; /* Tells us whether we are polling a socket for TX. * We only do this when socket buffer fills up. * Protected by tx vq lock. 
*/ @@ -92,11 +95,138 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock) net-tx_poll_state = VHOST_NET_POLL_STARTED; } +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + unsigned long flags; + + spin_lock_irqsave(vq-notify_lock, flags); + if (!list_empty(vq-notifier)) { + iocb = list_first_entry(vq-notifier, + struct kiocb, ki_list); + list_del(iocb-ki_list); + } + spin_unlock_irqrestore(vq-notify_lock, flags); + return iocb; +} + +static void handle_iocb(struct kiocb *iocb) +{ + struct vhost_virtqueue *vq = iocb-private; + unsigned long flags; + + spin_lock_irqsave(vq-notify_lock, flags); + list_add_tail(iocb-ki_list, vq-notifier); + spin_unlock_irqrestore(vq-notify_lock, flags); Don't we need to wake up the wq as well? +} + +static int is_async_vq(struct vhost_virtqueue *vq) +{ + return (vq-link_state == VHOST_VQ_LINK_ASYNC); () not needed +} + +static void handle_async_rx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq, + struct socket *sock) +{ + struct kiocb *iocb = NULL; + struct vhost_log *vq_log = NULL; + int rx_total_len = 0; + unsigned int head, log, in, out; + int size; + + if (!is_async_vq(vq)) + return; + + if (sock-sk-sk_data_ready) + sock-sk-sk_data_ready(sock-sk, 0); + + vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) ? + vq-log : NULL; + + while ((iocb = notify_dequeue(vq)) != NULL) { + vhost_add_used_and_signal(net-dev, vq, + iocb-ki_pos, iocb-ki_nbytes); + size = iocb-ki_nbytes; + head = iocb-ki_pos; + rx_total_len += iocb-ki_nbytes; + + if (iocb-ki_dtor) + iocb-ki_dtor(iocb); I am confused by the above. Isn't ki_dtor handle_iocb? Why is it called here? + kmem_cache_free(net-cache, iocb); + + /* when log is enabled, recomputing the log info is needed, + * since these buffers are in async queue, and may not get + * the log info before. + */ + if (unlikely(vq_log)) { + if (!log) log is uninitialized now? 
+ __vhost_get_vq_desc(net-dev, vq, vq-iov, + ARRAY_SIZE(vq-iov), + out, in, vq_log, + log, head); + vhost_log_write(vq, vq_log, log, size); + } + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) { + vhost_poll_queue(vq-poll); + break; + } + } +} + +static void handle_async_tx_events_notify(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + struct kiocb *iocb = NULL; + int tx_total_len = 0; + + if (!is_async_vq(vq)) + return; + + while ((iocb = notify_dequeue(vq)) != NULL) { Please just write this as while (((iocb = notify_dequeue(vq))) above
Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
On Sun, Apr 25, 2010 at 02:55:29AM -0700, David Miller wrote: From: xiaohui@intel.com Date: Sun, 25 Apr 2010 17:20:06 +0800 The idea is simple, just to pin the guest VM user space and then let host NIC driver has the chance to directly DMA to it. Isn't it much easier to map the RX ring of the network device into the guest's address space, have DMA map calls translate guest addresses to physical/DMA addresses as well as do all of this crazy page pinning stuff, and provide the translations and protections via the IOMMU? This means we need guest know how the specific network device works. So we won't be able to, for example, move guest between different hosts. There are other problems: many physical systems do not have an iommu, some guest OS-es do not support DMA map calls, doing VM exit on each DMA map call might turn out to be very slow. And so on. What's being proposed here looks a bit over-engineered. This is an attempt to reduce overhead for virtio (paravirtualization). 'Don't use PV' is kind of an alternative, but I do not think it's a simpler one. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
On Sun, Apr 25, 2010 at 05:20:06PM +0800, xiaohui@intel.com wrote: We provide a zero-copy method by which the driver side may get external buffers for DMA. Here "external" means the driver doesn't use kernel space to allocate skb buffers. Currently the external buffers can come from the guest virtio-net driver. The idea is simple: just pin the guest VM user space and then let the host NIC driver have the chance to directly DMA to it. The patches are based on the vhost-net backend driver. We add a device which provides proto_ops such as sendmsg/recvmsg to vhost-net, to send/recv directly to/from the NIC driver. A KVM guest which uses the vhost-net backend may bind any ethX interface on the host side to get copyless data transfer through the guest virtio-net frontend. patch 01-12: net core changes. patch 13-17: new device as interface to manipulate external buffers. patch 18: for vhost-net. The guest virtio-net driver submits multiple requests through the vhost-net backend driver to the kernel. The requests are queued and then completed after the corresponding actions in h/w are done. For read, user space buffers are dispensed to the NIC driver for rx when a page constructor API is invoked. This means NICs can allocate user buffers from a page constructor. We add a hook in the netif_receive_skb() function to intercept the incoming packets and notify the zero-copy device. For write, the zero-copy device may allocate a new host skb, put the payload on skb_shinfo(skb)->frags, and copy the header to skb->data. The request remains pending until the skb is transmitted by h/w. Here, we have considered 2 ways to utilize the page constructor API to dispense the user buffers. One: Modify the __alloc_skb() function a bit so it can allocate only a struct sk_buff, with the data pointer pointing to a user buffer which comes from a page constructor API. Then the shinfo of the skb is also from the guest. When a packet is received from hardware, skb->data is filled directly by h/w. This is what we have done.
Pros: We can avoid any copy here. Cons: The guest virtio-net driver needs to allocate skbs in almost the same way as the host NIC drivers, say the size of netdev_alloc_skb() and the same reserved space in the head of the skb. Many NIC drivers are the same as the guest in this respect and are ok with this. But some of the latest NIC drivers reserve special room in the skb head. To deal with it, we suggest providing a method in the guest virtio-net driver to ask for the parameters we are interested in from the NIC driver once we know which device we have bound for zero-copy. Then we ask the guest to do so. Is that reasonable?

Do you still do this?

Two: Modify the driver to get user buffers allocated from a page constructor API (to substitute alloc_page()); the user buffers are used as payload buffers and filled by h/w directly when a packet is received. The driver should associate the pages with the skb (skb_shinfo(skb)->frags). For the head buffer side, let the host allocate the skb, and h/w fills it. After that, the data filled in the host skb header will be copied into the guest header buffer, which is submitted together with the payload buffer. Pros: We care less about the way the guest or host allocates their buffers. Cons: We still need a small copy here for the skb header. We are not sure which way is better here. This is the first thing we want to get comments on from the community. We wish the modification to the network part to be generic, not used by the vhost-net backend only: a user application may use it as well when the zero-copy device provides async read/write operations later.

I commented on this in the past. Do you still want comments?

Please give comments especially on the network part modifications. We provide multiple submits and asynchronous notification to vhost-net too. Our goal is to improve the bandwidth and reduce the CPU usage. Exact performance data will be provided later. But for a simple test with netperf, we found bandwidth up and CPU % up too, but the bandwidth up ratio is much more than the CPU % up ratio.
What we have not done yet:
 packet split support
 to support GRO
 performance tuning

what we have done in v1:
 polish the RCU usage
 deal with write logging in asynchronous mode in vhost
 add notifier block for mp device
 rename page_ctor to mp_port in netdevice.h to make it look generic
 add mp_dev_change_flags() for mp device to change NIC state
 add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
 a small fix for missing dev_put when fail
 using dynamic minor instead of static minor number
 a __KERNEL__ protect to mp_get_sock()

what we have done in v2:
[PATCH] qemu-kvm: fix crash on reboot with vhost-net
When vhost-net is disabled on reboot, we set msix mask notifier to NULL to disable further mask/unmask notifications. Code currently tries to pass this NULL to notifier, leading to a crash. The right thing to do is:
- if vector is masked, we don't need to notify backend, just disable future notifications
- if vector is unmasked, invoke callback to unassign backend, then disable future notifications

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

Note: this patch is for qemu-kvm, the code in question is not in qemu.git.

 hw/msix.c |   15 ++++++++++++---
 1 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/hw/msix.c b/hw/msix.c
index 3ec8805..43361b5 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -613,9 +613,18 @@ int msix_set_mask_notifier(PCIDevice *dev, unsigned vector, void *opaque)
     if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
         return 0;
-    if (dev->msix_mask_notifier)
-        r = dev->msix_mask_notifier(dev, vector, opaque,
-                                    msix_is_masked(dev, vector));
+    if (dev->msix_mask_notifier && !msix_is_masked(dev, vector)) {
+        /* Switching notifiers while vector is unmasked:
+         * mask the old one, unmask the new one. */
+        if (dev->msix_mask_notifier_opaque[vector]) {
+            r = dev->msix_mask_notifier(dev, vector,
+                                        dev->msix_mask_notifier_opaque[vector],
+                                        1);
+        }
+        if (r >= 0 && opaque) {
+            r = dev->msix_mask_notifier(dev, vector, opaque, 0);
+        }
+    }
     if (r >= 0)
         dev->msix_mask_notifier_opaque[vector] = opaque;
     return r;
--
1.7.1.rc1.22.g3163
--
To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
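The masked/unmasked distinction in the description above can be restated as a tiny pure function (illustrative only; the real code also tracks the opaque pointers and propagates error returns):

```c
#include <assert.h>

/* How many backend notifications does switching mask notifiers require?
 * Masked vector: none, just swap the stored opaque pointer.
 * Unmasked vector: mask (unassign) the old backend if one is set, then
 * unmask (assign) the new one if one is given. */
int notifier_switch_calls(int vector_masked, int have_old, int have_new)
{
    if (vector_masked)
        return 0;
    return (have_old ? 1 : 0) + (have_new ? 1 : 0);
}
```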
Re: [PATCH v6] Add mergeable rx buffer support to vhost_net
On Mon, Apr 26, 2010 at 02:20:52PM -0700, David L Stevens wrote: This patch adds mergeable receive buffer support to vhost_net. +-DLS Signed-off-by: David L Stevens dlstev...@us.ibm.com OK, looks good. I still think iovec handling is a bit off, as commented below. diff -ruNp net-next-v0/drivers/vhost/net.c net-next-v6/drivers/vhost/net.c --- net-next-v0/drivers/vhost/net.c 2010-04-24 21:36:54.0 -0700 +++ net-next-v6/drivers/vhost/net.c 2010-04-26 01:13:04.0 -0700 @@ -109,7 +109,7 @@ static void handle_tx(struct vhost_net * }; size_t len, total_len = 0; int err, wmem; - size_t hdr_size; + size_t vhost_hlen; struct socket *sock = rcu_dereference(vq-private_data); if (!sock) return; @@ -128,13 +128,13 @@ static void handle_tx(struct vhost_net * if (wmem sock-sk-sk_sndbuf / 2) tx_poll_stop(net); - hdr_size = vq-hdr_size; + vhost_hlen = vq-vhost_hlen; for (;;) { - head = vhost_get_vq_desc(net-dev, vq, vq-iov, - ARRAY_SIZE(vq-iov), - out, in, - NULL, NULL); + head = vhost_get_desc(net-dev, vq, vq-iov, + ARRAY_SIZE(vq-iov), + out, in, + NULL, NULL); /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq-num) { wmem = atomic_read(sock-sk-sk_wmem_alloc); @@ -155,20 +155,20 @@ static void handle_tx(struct vhost_net * break; } /* Skip header. TODO: support TSO. */ - s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, out); + s = move_iovec_hdr(vq-iov, vq-hdr, vhost_hlen, out); msg.msg_iovlen = out; len = iov_length(vq-iov, out); /* Sanity check */ if (!len) { vq_err(vq, Unexpected header len for TX: %zd expected %zd\n, -iov_length(vq-hdr, s), hdr_size); +iov_length(vq-hdr, s), vhost_hlen); break; } /* TODO: Check specific error and bomb out unless ENOBUFS? 
*/ err = sock-ops-sendmsg(NULL, sock, msg, len); if (unlikely(err 0)) { - vhost_discard_vq_desc(vq); + vhost_discard_desc(vq, 1); tx_poll_start(net, sock); break; } @@ -187,12 +187,25 @@ static void handle_tx(struct vhost_net * unuse_mm(net-dev.mm); } +static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk) +{ + struct sk_buff *head; + int len = 0; + + lock_sock(sk); + head = skb_peek(sk-sk_receive_queue); + if (head) + len = head-len + vq-sock_hlen; + release_sock(sk); + return len; +} + /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_virtqueue *vq = net-dev.vqs[VHOST_NET_VQ_RX]; - unsigned head, out, in, log, s; + unsigned in, log, s; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, @@ -203,14 +216,14 @@ static void handle_rx(struct vhost_net * .msg_flags = MSG_DONTWAIT, }; - struct virtio_net_hdr hdr = { - .flags = 0, - .gso_type = VIRTIO_NET_HDR_GSO_NONE + struct virtio_net_hdr_mrg_rxbuf hdr = { + .hdr.flags = 0, + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t len, total_len = 0; - int err; - size_t hdr_size; + int err, headcount, datalen; + size_t vhost_hlen; struct socket *sock = rcu_dereference(vq-private_data); if (!sock || skb_queue_empty(sock-sk-sk_receive_queue)) return; @@ -218,18 +231,19 @@ static void handle_rx(struct vhost_net * use_mm(net-dev.mm); mutex_lock(vq-mutex); vhost_disable_notify(vq); - hdr_size = vq-hdr_size; + vhost_hlen = vq-vhost_hlen; vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL; - for (;;) { - head = vhost_get_vq_desc(net-dev, vq, vq-iov, - ARRAY_SIZE(vq-iov), - out, in, - vq_log, log); + while ((datalen = vhost_head_len(vq, sock-sk))) { + headcount = vhost_get_desc_n(vq, vq-heads, + datalen + vhost_hlen, +
[PATCHv2] qemu-kvm: fix crash on reboot with vhost-net
When vhost-net is disabled on reboot, we set msix mask notifier to NULL to disable further mask/unmask notifications. Code currently tries to pass this NULL to notifier, leading to a crash. The right thing to do is to add explicit APIs to enable/disable notifications. Now when disabling notifications:
- if vector is masked, we don't need to notify backend, just disable future notifications
- if vector is unmasked, invoke callback to unassign backend, then disable future notifications

This patch also polls notifier before closing it, to make sure we don't lose events if poll callback didn't have time to run.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

Changes from v1:
	Separate APIs to set and unset notifiers
	Test and clear notifier before destroying it

 hw/msix.c       |   40 ++++++++++++++++++++++++++++++++++------
 hw/msix.h       |    1 +
 hw/virtio-pci.c |    7 +++++--
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/hw/msix.c b/hw/msix.c
index 3ec8805..8f9a621 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -609,14 +609,44 @@ void msix_unuse_all_vectors(PCIDevice *dev)

 int msix_set_mask_notifier(PCIDevice *dev, unsigned vector, void *opaque)
 {
+    int r;
+    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
+        return 0;
+
+    assert(dev->msix_mask_notifier);
+    assert(opaque);
+    assert(!dev->msix_mask_notifier_opaque[vector]);
+
+    if (msix_is_masked(dev, vector)) {
+        return 0;
+    }
+    r = dev->msix_mask_notifier(dev, vector, opaque,
+                                msix_is_masked(dev, vector));
+    if (r < 0) {
+        return r;
+    }
+    dev->msix_mask_notifier_opaque[vector] = opaque;
+    return r;
+}
+
+int msix_unset_mask_notifier(PCIDevice *dev, unsigned vector)
+{
     int r = 0;
     if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
         return 0;
-    if (dev->msix_mask_notifier)
-        r = dev->msix_mask_notifier(dev, vector, opaque,
-                                    msix_is_masked(dev, vector));
-    if (r >= 0)
-        dev->msix_mask_notifier_opaque[vector] = opaque;
+    assert(dev->msix_mask_notifier);
+    assert(dev->msix_mask_notifier_opaque[vector]);
+
+    if (msix_is_masked(dev, vector)) {
+        return 0;
+    }
+    r = dev->msix_mask_notifier(dev, vector,
+                                dev->msix_mask_notifier_opaque[vector],
+                                msix_is_masked(dev, vector));
+    if (r < 0) {
+        return r;
+    }
+    dev->msix_mask_notifier_opaque[vector] = NULL;
     return r;
 }

diff --git a/hw/msix.h b/hw/msix.h
index f167231..6b21ffb 100644
--- a/hw/msix.h
+++ b/hw/msix.h
@@ -34,4 +34,5 @@ void msix_reset(PCIDevice *dev);
 extern int msix_supported;

 int msix_set_mask_notifier(PCIDevice *dev, unsigned vector, void *opaque);
+int msix_unset_mask_notifier(PCIDevice *dev, unsigned vector);
 #endif

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 99a588c..c4bc633 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -462,10 +462,13 @@ static int virtio_pci_set_guest_notifier(void *opaque, int n, bool assign)
         msix_set_mask_notifier(proxy->pci_dev,
                                virtio_queue_vector(proxy->vdev, n), vq);
     } else {
-        msix_set_mask_notifier(proxy->pci_dev,
-                               virtio_queue_vector(proxy->vdev, n), NULL);
+        msix_unset_mask_notifier(proxy->pci_dev,
+                                 virtio_queue_vector(proxy->vdev, n));
         qemu_set_fd_handler(event_notifier_get_fd(notifier),
                             NULL, NULL, NULL);
+        /* Test and clear notifier before closing it,
+         * in case poll callback didn't have time to run. */
+        virtio_pci_guest_notifier_read(vq);
         event_notifier_cleanup(notifier);
     }
--
1.7.1.rc1.22.g3163
--
To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
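The "test and clear notifier before closing" step matters because a guest notification may land after the last poll callback ran; with an eventfd-backed notifier, draining it is a single read. A self-contained sketch using a plain Linux eventfd rather than qemu's EventNotifier wrapper (function names are illustrative):

```c
#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* Drain any pending event before closing, so a notification that
 * arrived after the last poll callback ran is not lost.
 * Returns 1 if an event was pending, 0 otherwise. */
int notifier_test_and_clear(int efd)
{
    uint64_t count;
    /* efd was created with EFD_NONBLOCK, so this never blocks. */
    ssize_t r = read(efd, &count, sizeof(count));
    return r == sizeof(count) && count > 0;
}

/* Simulate a notification arriving just before teardown. */
int pending_before_close(void)
{
    int efd = eventfd(0, EFD_NONBLOCK);
    uint64_t one = 1;
    write(efd, &one, sizeof(one)); /* the "late" guest notification */
    int pending = notifier_test_and_clear(efd);
    close(efd);
    return pending;
}
```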
Re: [PATCH] virtio-spec: document block CMD and FLUSH
On Tue, Apr 20, 2010 at 12:26:27AM +0300, Michael S. Tsirkin wrote:
On Fri, Feb 19, 2010 at 12:22:20AM +0200, Michael S. Tsirkin wrote:
I took a stab at documenting CMD and FLUSH request types in virtio block.

Any comments? Rusty?
Re: [PATCHv2] qemu-kvm: fix crash on reboot with vhost-net
On Wed, Apr 28, 2010 at 02:43:05PM -0300, Marcelo Tosatti wrote:
Signed-off-by: Michael S. Tsirkin m...@redhat.com
Applied, thanks.

Can you tell me what commit id it has pls (for backport to rhel6).
[RHEL6.0 PATCH] qemu-kvm: fix crash on reboot with vhost-net
Bugzilla: 585940
Upstream status: applied on qemu-kvm.git, commit 992cc816c42f2e93db033919a9ddbfcd1da4

When vhost-net is disabled on reboot, we set msix mask notifier to NULL
to disable further mask/unmask notifications. Code currently tries to
pass this NULL to notifier, leading to a crash. The right thing to do
is to add explicit APIs to enable/disable notifications.

Now when disabling notifications:
- if vector is masked, we don't need to notify backend, just disable
  future notifications
- if vector is unmasked, invoke callback to unassign backend, then
  disable future notifications

This patch also polls notifier before closing it, to make sure we don't
lose events if poll callback didn't have time to run.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/msix.c       | 40 +++-
 hw/msix.h       |  1 +
 hw/virtio-pci.c |  7 +--
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/hw/msix.c b/hw/msix.c
index 3fcf3a1..94e3981 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -610,14 +610,44 @@ void msix_unuse_all_vectors(PCIDevice *dev)
 int msix_set_mask_notifier(PCIDevice *dev, unsigned vector, void *opaque)
 {
+    int r;
+    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
+        return 0;
+
+    assert(dev->msix_mask_notifier);
+    assert(opaque);
+    assert(!dev->msix_mask_notifier_opaque[vector]);
+
+    if (msix_is_masked(dev, vector)) {
+        return 0;
+    }
+    r = dev->msix_mask_notifier(dev, vector, opaque,
+                                msix_is_masked(dev, vector));
+    if (r < 0) {
+        return r;
+    }
+    dev->msix_mask_notifier_opaque[vector] = opaque;
+    return r;
+}
+
+int msix_unset_mask_notifier(PCIDevice *dev, unsigned vector)
+{
     int r = 0;
     if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
         return 0;
-    if (dev->msix_mask_notifier)
-        r = dev->msix_mask_notifier(dev, vector, opaque,
-                                    msix_is_masked(dev, vector));
-    if (r >= 0)
-        dev->msix_mask_notifier_opaque[vector] = opaque;
+    assert(dev->msix_mask_notifier);
+    assert(dev->msix_mask_notifier_opaque[vector]);
+
+    if (msix_is_masked(dev, vector)) {
+        return 0;
+    }
+    r = dev->msix_mask_notifier(dev, vector,
+                                dev->msix_mask_notifier_opaque[vector],
+                                msix_is_masked(dev, vector));
+    if (r < 0) {
+        return r;
+    }
+    dev->msix_mask_notifier_opaque[vector] = NULL;
     return r;
 }

diff --git a/hw/msix.h b/hw/msix.h
index f167231..6b21ffb 100644
--- a/hw/msix.h
+++ b/hw/msix.h
@@ -34,4 +34,5 @@ void msix_reset(PCIDevice *dev);
 extern int msix_supported;
 int msix_set_mask_notifier(PCIDevice *dev, unsigned vector, void *opaque);
+int msix_unset_mask_notifier(PCIDevice *dev, unsigned vector);
 #endif

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index cba188c..22f7fa0 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -437,10 +437,13 @@ static int virtio_pci_guest_notifier(void *opaque, int n, bool assign)
         msix_set_mask_notifier(proxy->pci_dev,
                                virtio_queue_vector(proxy->vdev, n), vq);
     } else {
-        msix_set_mask_notifier(proxy->pci_dev,
-                               virtio_queue_vector(proxy->vdev, n), NULL);
+        msix_unset_mask_notifier(proxy->pci_dev,
+                                 virtio_queue_vector(proxy->vdev, n));
         qemu_set_fd_handler(event_notifier_get_fd(notifier),
                             NULL, NULL, NULL);
+        /* Test and clear notifier before closing it,
+         * in case poll callback didn't have time to run. */
+        virtio_pci_guest_notifier_read(vq);
         event_notifier_cleanup(notifier);
     }
--
1.7.1.rc1.22.g3163
Re: [RHEL6.0 PATCH] qemu-kvm: fix crash on reboot with vhost-net
On Wed, Apr 28, 2010 at 11:16:15PM +0300, Michael S. Tsirkin wrote:
Bugzilla: 585940
Upstream status: applied on qemu-kvm.git, commit 992cc816c42f2e93db033919a9ddbfcd1da4

Please disregard, sent to this list in error. Sorry about the noise.
Re: [PATCH V3] drivers/uio/uio_pci_generic.c: allow access for non-privileged processes
On Thu, Apr 29, 2010 at 12:29:40PM -0700, Tom Lyon wrote:
Michael, et al - sorry for the delay, but I've been digesting the comments and researching new approaches.

I think the plan for V4 will be to take things entirely out of the UIO framework, and instead have a driver which supports user mode use of well-behaved PCI devices. I would like to use read and write to support access to memory regions, IO regions, or PCI config space. Config space is a bitch because not everything is safe to read or write, but I've come up with a table-driven approach which can be run-time extended for non-compliant devices (under root control) which could then enable non-privileged users. For instance, OHCI 1394 devices use a dword in config space which is not formatted as a PCI capability; root can use sysfs to enable access:

echo offset readbits writebits > /sys/dev/pci/devices/:xx:xx.x/yyy/config_permit

A well-behaved PCI device must have memory BARs >= 4K for mmaping, must have separate memory space for MSI-X that does not need mmaping by the user driver, must support the PCI 2.3 interrupt masking, and must not go totally crazy with PCI config space (tg3 is real ugly, e1000 is fine).

e1000 has a good driver in kernel, though.

Again, my primary usage model is for direct user-level access to network devices, not for virtualization, but I think both will work.

I suspect that without mmap and (to a lesser extent) write-combining, this would be pretty useless for virtualization.

So, I will go outside UIO because:
1 - it doesn't allow reads and writes to sub-drivers, just irqcontrol
2 - it doesn't have ioctls
3 - it has its own interrupt model which doesn't use eventfds
4 - it's ugly doing the new stuff and maintaining backwards compat.

I hereby solicit comments on the name and location for the new driver.
Re: [PATCHv7] add mergeable buffers support to vhost_net
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
This patch adds mergeable receive buffer support to vhost_net.
Signed-off-by: David L Stevens dlstev...@us.ibm.com

I have applied this, thanks very much! I have also applied some tweaks on top, please take a look.

Thanks, MST

commit 2809e94f5f26d89dc5232aaec753ffda95c4d95e
Author: Michael S. Tsirkin m...@redhat.com
Date:   Thu Apr 29 16:18:08 2010 +0300

    vhost-net: minor tweaks in mergeable buffer code

    Applies the following tweaks on top of mergeable buffers patch:

    1. vhost_get_desc_n assumes that all descriptors are 'in' only.
       It's also unlikely to be useful for any vhost frontend besides
       vhost_net, so just move it to net.c, and rename get_rx_bufs
       for brevity.

    2. for rx, we called iov_length within vhost_get_desc_n
       (now get_rx_bufs) already, so we don't need an extra call to
       iov_length to avoid overflow anymore. Accordingly,
       copy_iovec_hdr can return void now.

    3. for rx, do some further code tweaks:
       - do not assign len = err as we check that err == len
       - handle data length in a way similar to how we handle header
         length: datalen -> sock_len, len -> vhost_len
       - add sock_hlen as a local variable, for symmetry with vhost_hlen

    4. add some likely/unlikely annotations

    Signed-off-by: Michael S. Tsirkin m...@redhat.com

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d61d945..85519b4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,9 +74,9 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
        }
        return seg;
 }
-/* Copy iovec entries for len bytes from iovec. Return segments used. */
-static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
-                          size_t len, int iovcount)
+/* Copy iovec entries for len bytes from iovec. */
+static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
+                           size_t len, int iovcount)
 {
        int seg = 0;
        size_t size;
@@ -89,7 +89,6 @@ static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
                ++to;
                ++seg;
        }
-       return seg;
 }

 /* Caller must have TX VQ lock */
@@ -204,7 +203,7 @@ static void handle_tx(struct vhost_net *net)
        unuse_mm(net->dev.mm);
 }

-static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
+static int peek_head_len(struct vhost_virtqueue *vq, struct sock *sk)
 {
        struct sk_buff *head;
        int len = 0;
@@ -212,17 +211,70 @@ static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
        lock_sock(sk);
        head = skb_peek(&sk->sk_receive_queue);
        if (head)
-               len = head->len + vq->sock_hlen;
+               len = head->len;
        release_sock(sk);
        return len;
 }

+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *     vq has read descriptors only.
+ * @vq       - the relevant virtqueue
+ * @datalen  - data length we'll be reading
+ * @iovcount - returned count of io vectors we fill
+ * @log      - vhost log
+ * @log_num  - log offset
+ *     returns number of buffer heads allocated, negative on error
+ */
+static int get_rx_bufs(struct vhost_virtqueue *vq,
+                      struct vring_used_elem *heads,
+                      int datalen,
+                      unsigned *iovcount,
+                      struct vhost_log *log,
+                      unsigned *log_num)
+{
+       unsigned int out, in;
+       int seg = 0;
+       int headcount = 0;
+       unsigned d;
+       int r;
+
+       while (datalen > 0) {
+               if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
+                       r = -ENOBUFS;
+                       goto err;
+               }
+               d = vhost_get_desc(vq->dev, vq, vq->iov + seg,
+                                  ARRAY_SIZE(vq->iov) - seg, &out,
+                                  &in, log, log_num);
+               if (d == vq->num) {
+                       r = 0;
+                       goto err;
+               }
+               if (unlikely(out || in <= 0)) {
+                       vq_err(vq, "unexpected descriptor format for RX: "
+                              "out %d, in %d\n", out, in);
+                       r = -EINVAL;
+                       goto err;
+               }
+               heads[headcount].id = d;
+               heads[headcount].len = iov_length(vq->iov + seg, in);
+               datalen -= heads[headcount].len;
+               ++headcount;
+               seg += in;
+       }
+       *iovcount = seg;
+       return headcount;
+err:
+       vhost_discard_desc(vq, headcount);
+       return r;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
        struct vhost_virtqueue *vq = &net->dev.vqs
Re: [PATCHv7] add mergeable buffers support to vhost_net
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
This patch adds mergeable receive buffer support to vhost_net.
Signed-off-by: David L Stevens dlstev...@us.ibm.com

You can find the latest version on the following net-next based tree:
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
-- MST
Re: [PATCHv7] add mergeable buffers support to vhost_net
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
This patch adds mergeable receive buffer support to vhost_net.
Signed-off-by: David L Stevens dlstev...@us.ibm.com

I've been doing some more testing before sending out a pull request, and I see a drastic performance degradation in guest to host traffic when this is applied but mergeable buffers are not in use by userspace (existing qemu-kvm userspace). This is both with and without my patch on top.

Without patch:

[...@tuck ~]$ sh runtest 21 | tee ser-meregeable-disabled-kernel-only-tun-only.log
Starting netserver at port 12865
set_up_server could not establish a listen endpoint for port 12865 with family AF_UNSPEC
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.0.0.4 (11.0.0.4) port 0 AF_INET : demo

Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      9107.26   89.20    33.85    0.802   2.436

With patch:

TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.0.0.4 (11.0.0.4) port 0 AF_INET : demo

Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00        35.00   2.21     0.62     5.181   11.575

For ease of testing, I put this on my tree
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost-broken

Please take a look. Thanks!
-- MST
Re: [PATCHv7] add mergeable buffers support to vhost_net
On Mon, May 03, 2010 at 08:39:08AM -0700, David Stevens wrote:
Michael S. Tsirkin m...@redhat.com wrote on 05/03/2010 03:34:11 AM:
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
This patch adds mergeable receive buffer support to vhost_net.
Signed-off-by: David L Stevens dlstev...@us.ibm.com

I've been doing some more testing before sending out a pull request, and I see a drastic performance degradation in guest to host traffic when this is applied but mergeable buffers are not in use by userspace (existing qemu-kvm userspace).

Actually, I wouldn't expect it to work at all; the qemu-kvm patch (particularly the feature bit setting bug fix) is required.

Which bugfix is that? Without it, I think the existing code will tell the guest to use mergeable buffers while turning it off in vhost. It should not do that; specifically, vhost_net_get_features does:

features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);

unconditionally. This was added with 5751995a20e77cd9d61d00f7390401895fa172a6.

I forced mergeable buffers off with -global virtio-net-pci.mrg_rxbuf=off and it did not seem to help either.

That was completely non-functional for me -- what version of qemu-kvm are you using?

992cc816c42f2e93db033919a9ddbfcd1da4 or later should work well AFAIK.

What I did to test w/o mergeable buffers is turn off the bit in VHOST_FEATURES.

It should be enough to force mergeable buffers off on the qemu command line: -global virtio-net-pci.mrg_rxbuf=off

I'll recheck these, but qemu-kvm definitely must be updated; the original doesn't correctly handle feature bits.
+-DLS

Hmm, I don't see the bug.
-- MST
[GIT PULL] first round of vhost-net enhancements for net-next
David,
The following tree includes a couple of enhancements that help vhost-net. Please pull them for net-next. Another set of patches is under debugging/testing and I hope to get them ready in time for 2.6.35, so there may be another pull request later. Thanks!

The following changes since commit 7ef527377b88ff05fb122a47619ea506c631c914:
  Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6 (2010-05-02 22:02:06 -0700)
are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost

Michael S. Tsirkin (2):
  tun: add ioctl to modify vnet header size
  macvtap: add ioctl to modify vnet header size

 drivers/net/macvtap.c  | 31 +++
 drivers/net/tun.c      | 32
 include/linux/if_tun.h |  2 ++
 3 files changed, 57 insertions(+), 8 deletions(-)
-- MST
Re: [GIT PULL] first round of vhost-net enhancements for net-next
On Mon, May 03, 2010 at 03:08:29PM -0700, David Miller wrote:
From: David Miller da...@davemloft.net
Date: Mon, 03 May 2010 15:07:29 -0700 (PDT)
From: Michael S. Tsirkin m...@redhat.com
Date: Tue, 4 May 2010 00:32:45 +0300

The following tree includes a couple of enhancements that help vhost-net. Please pull them for net-next. Another set of patches is under debugging/testing and I hope to get them ready in time for 2.6.35, so there may be another pull request later.

Pulled, thanks.

Nevermind, reverted. Do you even compile test what you send to people?

drivers/net/macvtap.c: In function ‘macvtap_ioctl’:
drivers/net/macvtap.c:713: warning: control reaches end of non-void function

You're really batting 1000 today Michael...

Ouch. Should teach me not to send out stuff after midnight. Sorry about the noise.
-- MST
[GIT PULL] amended: first round of vhost-net enhancements for net-next
David,
This is an amended pull request: I have rebased the tree to the correct patches. This has been through basic tests and seems to work fine here.

The following tree includes a couple of enhancements that help vhost-net. Please pull them for net-next. Another set of patches is under debugging/testing and I hope to get them ready in time for 2.6.35, so there may be another pull request later. Thanks!

The following changes since commit 7ef527377b88ff05fb122a47619ea506c631c914:
  Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6 (2010-05-02 22:02:06 -0700)
are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost

Michael S. Tsirkin (2):
  tun: add ioctl to modify vnet header size
  macvtap: add ioctl to modify vnet header size

 drivers/net/macvtap.c  | 27 +++
 drivers/net/tun.c      | 32
 include/linux/if_tun.h |  2 ++
 3 files changed, 53 insertions(+), 8 deletions(-)
-- MST
Re: virtio: put last_used and last_avail index into ring itself.
virtio: put last_used and last_avail index into ring itself.

Generally, the other end of the virtio ring doesn't need to see where you're up to in consuming the ring. However, to completely understand what's going on from the outside, this information must be exposed. For example, if you want to save and restore a virtio_ring, but you're not the consumer because the kernel is using it directly.

Fortunately, we have room to expand: the ring is always a whole number of pages and there's hundreds of bytes of padding after the avail ring and the used ring, whatever the number of descriptors (which must be a power of 2).

We add a feature bit so the guest can tell the host that it's writing out the current value there, if it wants to use that.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au

I've been looking at this patch some more (more on why later), and I wonder: would it be better to add some alignment to the last used index address, so that if we later add more stuff at the tail, it all fits in a single cache line? We use a new feature bit anyway, so layout change should not be a problem.

Since I raised the question of caches: for the used ring, the ring is not aligned to 64 bit, so on CPUs with 64 bit or larger cache lines, used entries will often cross cache line boundaries. Am I right, and might it have been better to align ring entries to cache line boundaries? What do you think?

---
 drivers/virtio/virtio_ring.c | 23 +++
 include/linux/virtio_ring.h  | 12 +++-
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -71,9 +71,6 @@ struct vring_virtqueue
        /* Number we've added since last sync. */
        unsigned int num_added;

-       /* Last used index we've seen. */
-       u16 last_used_idx;
-
        /* How to notify other side. FIXME: commonalize hcalls! */
        void (*notify)(struct virtqueue *vq);
@@ -278,12 +275,13 @@ static void detach_buf(struct vring_virt
 static inline bool more_used(const struct vring_virtqueue *vq)
 {
-       return vq->last_used_idx != vq->vring.used->idx;
+       return vring_last_used(&vq->vring) != vq->vring.used->idx;
 }

 static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
 {
        struct vring_virtqueue *vq = to_vvq(_vq);
+       struct vring_used_elem *u;
        void *ret;
        unsigned int i;
@@ -300,8 +298,11 @@ static void *vring_get_buf(struct virtqu
                return NULL;
        }

-       i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
-       *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
+       u = &vq->vring.used->ring[vring_last_used(&vq->vring) % vq->vring.num];
+       i = u->id;
+       *len = u->len;
+       /* Make sure we don't reload i after doing checks. */
+       rmb();

        if (unlikely(i >= vq->vring.num)) {
                BAD_RING(vq, "id %u out of range\n", i);
@@ -315,7 +316,8 @@ static void *vring_get_buf(struct virtqu
        /* detach_buf clears data, so grab it now. */
        ret = vq->data[i];
        detach_buf(vq, i);
-       vq->last_used_idx++;
+       vring_last_used(&vq->vring)++;
+
        END_USE(vq);
        return ret;
 }
@@ -402,7 +404,6 @@ struct virtqueue *vring_new_virtqueue(un
        vq->vq.name = name;
        vq->notify = notify;
        vq->broken = false;
-       vq->last_used_idx = 0;
        vq->num_added = 0;
        list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
@@ -413,6 +414,10 @@ struct virtqueue *vring_new_virtqueue(un
        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);

+       /* We publish indices whether they offer it or not: if not, it's junk
+        * space anyway. But calling this acknowledges the feature. */
+       virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_INDICES);
+
        /* No callback? Tell other side not to bother us. */
        if (!callback)
                vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
@@ -443,6 +448,8 @@ void vring_transport_features(struct vir
                switch (i) {
                case VIRTIO_RING_F_INDIRECT_DESC:
                        break;
+               case VIRTIO_RING_F_PUBLISH_INDICES:
+                       break;
                default:
                        /* We don't understand this bit. */
                        clear_bit(i, vdev->features);
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -29,6 +29,9 @@
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC    28

+/* We publish our last-seen used index at the end of the avail ring. */
+#define VIRTIO_RING_F_PUBLISH_INDICES  29
+
 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
 struct vring_desc
 {
@@ -87,6 +90,7 @@ struct vring {
 * __u16
Re: [PATCH] virtio-spec: document block CMD and FLUSH
On Tue, May 04, 2010 at 08:54:59PM +0200, Christoph Hellwig wrote:
On Fri, Feb 19, 2010 at 12:22:20AM +0200, Michael S. Tsirkin wrote:
I took a stab at documenting CMD and FLUSH request types in virtio block. Christoph, could you look over this please? I note that the interface seems full of warts to me; this might be a first step to cleaning them.

The whole virtio-blk interface is full of warts. It has been extended rather ad-hoc, so that is rather expected.

One issue I struggled with especially is how the type field mixes bits and non-bit values. I ended up simply defining all legal values, so that we have CMD = 2, CMD_OUT = 3 and so on.

It's basically a complete mess without much logic behind it.

+\change_unchanged
+the high bit
+\change_inserted 0 1266497301
+ (VIRTIO_BLK_T_BARRIER)
+\change_unchanged
+ indicates that this request acts as a barrier and that all preceding requests
+ must be complete before this one, and all following requests must not be
+ started until this is complete.
+
+\change_inserted 0 1266504385
+ Note that a barrier does not flush caches in the underlying backend device
+ in host, and thus does not serve as data consistency guarantee.
+ Driver must use FLUSH request to flush the host cache.
+\change_unchanged

I'm not sure it's even worth documenting it. I can't see any way to actually implement safe behaviour with the VIRTIO_BLK_T_BARRIER-style barriers.

lguest seems to still use this. I guess if you have a reliable host, VIRTIO_BLK_T_BARRIER is enough?

Btw, did I mention that .lyx is a really horrible format to review diffs for? Plain latex would be a lot better..
Re: [PATCH] virtio-spec: document block CMD and FLUSH
On Tue, May 04, 2010 at 09:56:18PM +0300, Michael S. Tsirkin wrote:
On Tue, May 04, 2010 at 08:54:59PM +0200, Christoph Hellwig wrote:
On Fri, Feb 19, 2010 at 12:22:20AM +0200, Michael S. Tsirkin wrote:
I took a stab at documenting CMD and FLUSH request types in virtio block. Christoph, could you look over this please? I note that the interface seems full of warts to me; this might be a first step to cleaning them.

The whole virtio-blk interface is full of warts. It has been extended rather ad-hoc, so that is rather expected.

One issue I struggled with especially is how the type field mixes bits and non-bit values. I ended up simply defining all legal values, so that we have CMD = 2, CMD_OUT = 3 and so on.

It's basically a complete mess without much logic behind it.

+\change_unchanged
+the high bit
+\change_inserted 0 1266497301
+ (VIRTIO_BLK_T_BARRIER)
+\change_unchanged
+ indicates that this request acts as a barrier and that all preceding requests
+ must be complete before this one, and all following requests must not be
+ started until this is complete.
+
+\change_inserted 0 1266504385
+ Note that a barrier does not flush caches in the underlying backend device
+ in host, and thus does not serve as data consistency guarantee.
+ Driver must use FLUSH request to flush the host cache.
+\change_unchanged

I'm not sure it's even worth documenting it. I can't see any way to actually implement safe behaviour with the VIRTIO_BLK_T_BARRIER-style barriers.

lguest seems to still use this.

Sorry, it doesn't. No idea why I thought it does.

I guess if you have a reliable host, VIRTIO_BLK_T_BARRIER is enough?

Btw, did I mention that .lyx is a really horrible format to review diffs for? Plain latex would be a lot better..
Re: [Qemu-devel] [PATCH] virtio-spec: document block CMD and FLUSH
On Tue, May 04, 2010 at 08:56:14PM +0200, Christoph Hellwig wrote:
On Tue, Apr 20, 2010 at 02:46:35AM +0100, Jamie Lokier wrote:
Does this mean that virtio-blk supports all three combinations?
1. FLUSH that isn't a barrier
2. FLUSH that is also a barrier
3. Barrier that is not a flush
1 is good for fsync-like operations; 2 is good for journalling-like ordered operations. 3 sounds like it doesn't mean a lot, as the host cache provides no guarantees and has no ordering facility that can be used.

No. The Linux virtio_blk guest driver either supports data integrity by using FLUSH, or can send down BARRIER requests, which aren't much help at all.

It seems we use BARRIER when we get REQ_HARDBARRIER, right? What does the REQ_HARDBARRIER flag in a request mean, and when is it set?

Qemu only implements FLUSH anyway.
question on virtio
Hi!
I see this in virtio_ring.c:

/* Put entry in available array (but don't update avail->idx
 * until they do sync). */

Why is it done this way? It seems that updating the index straight away would be simpler, while this might allow the host to speculatively look up the buffer and handle it, without waiting for the kick.
-- MST
[PATCH RFC] virtio: put last seen used index into ring itself
Generally, the Host end of the virtio ring doesn't need to see where the Guest is up to in consuming the ring. However, to completely understand what's going on from the outside, this information must be exposed. For example, the host can reduce the number of interrupts by detecting that the guest is currently handling previous buffers.

Fortunately, we have room to expand: the ring is always a whole number of pages and there's hundreds of bytes of padding after the avail ring and the used ring, whatever the number of descriptors (which must be a power of 2).

We add a feature bit so the guest can tell the host that it's writing out the current value there, if it wants to use that. This is based on a patch by Rusty Russell, with the main difference being that we dedicate a feature bit to guest to tell the host it is writing the used index. This way we don't need to force host to publish the last available index until we have a use for it.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

Rusty, this is a simplified form of a patch you posted in the past. I have a vhost patch that, using this feature, shows external to host bandwidth grow from 5 to 7 GB/s, by avoiding an interrupt in the window after previous interrupt was sent and before interrupts were disabled for the vq. With vhost under some external to host loads I see this window being hit about 30% sometimes. I'm finalizing the host bits and plan to send the final version for inclusion when all's ready, but I'd like to hear comments meanwhile.

 drivers/virtio/virtio_ring.c | 28 +---
 include/linux/virtio_ring.h  | 14 +-
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 1ca8890..7729aba 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -89,9 +89,6 @@ struct vring_virtqueue
        /* Number we've added since last sync. */
        unsigned int num_added;

-       /* Last used index we've seen. */
-       u16 last_used_idx;
-
        /* How to notify other side. FIXME: commonalize hcalls! */
        void (*notify)(struct virtqueue *vq);
@@ -285,12 +282,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
 static inline bool more_used(const struct vring_virtqueue *vq)
 {
-       return vq->last_used_idx != vq->vring.used->idx;
+       return *vq->vring.last_used_idx != vq->vring.used->idx;
 }

 void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 {
        struct vring_virtqueue *vq = to_vvq(_vq);
+       struct vring_used_elem *u;
        void *ret;
        unsigned int i;
@@ -307,12 +305,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
                return NULL;
        }

-       /* Only get used array entries after they have been exposed by host. */
-       virtio_rmb();
-
-       i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
-       *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
+       /* Only get used array entries after they have been exposed by host.
+        * Need mb(), not just rmb() because we write last_used_idx below. */
+       virtio_mb();
+       u = &vq->vring.used->ring[*vq->vring.last_used_idx % vq->vring.num];
+       i = u->id;
+       *len = u->len;

        if (unlikely(i >= vq->vring.num)) {
                BAD_RING(vq, "id %u out of range\n", i);
                return NULL;
@@ -325,7 +324,8 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
        /* detach_buf clears data, so grab it now. */
        ret = vq->data[i];
        detach_buf(vq, i);
-       vq->last_used_idx++;
+       (*vq->vring.last_used_idx)++;
+
        END_USE(vq);
        return ret;
 }
@@ -431,7 +431,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
        vq->vq.name = name;
        vq->notify = notify;
        vq->broken = false;
-       vq->last_used_idx = 0;
+       *vq->vring.last_used_idx = 0;
        vq->num_added = 0;
        list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
@@ -440,6 +440,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);

+       /* We publish used index whether Host offers it or not: if not, it's
+        * junk space anyway. But calling this acknowledges the feature. */
+       virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_USED);
+
        /* No callback? Tell other side not to bother us. */
        if (!callback)
                vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
@@ -473,6 +477,8 @@ void vring_transport_features(struct virtio_device *vdev)
                switch (i) {
                case VIRTIO_RING_F_INDIRECT_DESC:
                        break;
+               case VIRTIO_RING_F_PUBLISH_INDICES:
+                       break;
                default:
                        /* We don't understand this bit
Re: question on virtio
On Wed, May 05, 2010 at 02:40:15PM -0500, Anthony Liguori wrote: On 05/05/2010 06:09 AM, Michael S. Tsirkin wrote: Hi! I see this in virtio_ring.c:

	/* Put entry in available array (but don't update avail->idx
	 * until they do sync). */

Why is it done this way? It seems that updating the index straight away would be simpler, while this might allow the host to speculatively look up the buffer and handle it, without waiting for the kick. It should be okay as long as you don't update idx for partial vectors. Regards, Anthony Liguori Sorry, what do you mean by partial vectors here?
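The deferred-index scheme Anthony and Michael are discussing can be sketched in userspace. This is a toy model, not the virtio code: the `toy_*` names and the use of C11 atomics in place of the kernel's barriers are my own; the point is only that ring slots are written first and a single release store of the index publishes the whole batch.

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define RING_NUM 8 /* must be a power of 2 */

struct toy_avail {
    _Atomic uint16_t idx;     /* host may consume entries up to here */
    uint16_t ring[RING_NUM];  /* descriptor heads */
};

struct toy_vq {
    struct toy_avail avail;
    uint16_t num_added;       /* entries staged but not yet published */
};

/* Stage a buffer: write the ring slot, but leave avail.idx untouched,
 * so the host does not consume a half-built batch. */
static void toy_add_buf(struct toy_vq *vq, uint16_t head)
{
    uint16_t idx = atomic_load_explicit(&vq->avail.idx, memory_order_relaxed);
    vq->avail.ring[(uint16_t)(idx + vq->num_added++) % RING_NUM] = head;
}

/* Publish everything staged so far with a single index update.
 * The release store plays the role of the wmb() before the idx write:
 * the host must never see idx move before the ring slots are filled. */
static void toy_kick(struct toy_vq *vq)
{
    uint16_t idx = atomic_load_explicit(&vq->avail.idx, memory_order_relaxed);
    atomic_store_explicit(&vq->avail.idx, (uint16_t)(idx + vq->num_added),
                          memory_order_release);
    vq->num_added = 0;
}
```

Until `toy_kick()` runs, a host-side reader polling `avail.idx` sees no new work even though slots are already filled, which is exactly the "don't update avail->idx until they do sync" behaviour quoted above.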
Re: [PATCH RFC] virtio: put last seen used index into ring itself
On Thu, May 06, 2010 at 12:01:34PM +0930, Rusty Russell wrote: On Thu, 6 May 2010 06:28:14 am Michael S. Tsirkin wrote: Rusty, this is a simplified form of a patch you posted in the past. I have a vhost patch that, using this feature, shows external-to-host bandwidth grow from 5 to 7 GB/s, by avoiding an interrupt in the window after the previous interrupt was sent and before interrupts were disabled for the vq. With vhost under some external-to-host loads I see this window being hit about 30% of the time. Fascinating. So you use this to guess if the guest is still processing? Exactly. I haven't thought about it hard, but is that racy? I thought about this really hard and I don't think it's necessarily racy, as long as (pseudo code):

guest:
start:
	disable interrupts
	read(used)
	write(last used)
	enable interrupts
	mb()
	if read(used) goto start

host:
	write used
	mb()
	if (reached(read(last used), used)) interrupt

IOW, guest does read then write then read; host does write then read. Now, the only way to miss an interrupt is if we read the last used value before it is written, so we think the guest is still processing. But if that happens, this means that the host has written used before the guest updated last used, and for this reason peek will see the used value and restart polling. IOW, to miss an interrupt the host must read a stale value. For this the host must read before the guest writes; but then the host write has already happened, so the second read in the guest will see an updated value the host has written. Now, I also added an mb() in the guest between read and write so that the last used index write can not get ahead of the used index read. It does feel good to have it there, but I can not say why it's helpful. Works fine without it, but then these subtle races might be hard to trigger. What do you think? Obviously happy to apply this when you finalize it. Thanks! Rusty.
Re: virtio: put last_used and last_avail index into ring itself.
On Thu, May 06, 2010 at 10:22:12AM +0930, Rusty Russell wrote: On Wed, 5 May 2010 03:52:36 am Michael S. Tsirkin wrote: virtio: put last_used and last_avail index into ring itself. Generally, the other end of the virtio ring doesn't need to see where you're up to in consuming the ring. However, to completely understand what's going on from the outside, this information must be exposed. For example, if you want to save and restore a virtio_ring, but you're not the consumer because the kernel is using it directly. Fortunately, we have room to expand: the ring is always a whole number of pages and there's hundreds of bytes of padding after the avail ring and the used ring, whatever the number of descriptors (which must be a power of 2). We add a feature bit so the guest can tell the host that it's writing out the current value there, if it wants to use that. Signed-off-by: Rusty Russell ru...@rustcorp.com.au I've been looking at this patch some more (more on why later), and I wonder: would it be better to add some alignment to the last used index address, so that if we later add more stuff at the tail, it all fits in a single cache line? In theory, but not in practice. We don't have many rings, so the difference between 1 and 2 cache lines is not very much. Fair enough. We use a new feature bit anyway, so a layout change should not be a problem. Since I raised the question of caches: for the used ring, the ring is not aligned to 64 bit, so on CPUs with 64 bit or larger cache lines, used entries will often cross cache line boundaries. Am I right, and might it have been better to align ring entries to cache line boundaries? What do you think? I think everyone is settled on 128 byte cache lines for the foreseeable future, so it's not really an issue. Cheers, Rusty. You mean with 64 bit descriptors we will be bouncing a cache line between host and guest, anyway?
-- MST
[PATCH RFC] qemu/virtio: use last used index published by guest
Reduces irq_window in guest by only injecting an interrupt if guest has handled all buffers we used so far.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

This is the qemu part of the story.

 hw/vhost_net.c |    6 ++
 hw/virtio.c    |   15 +++
 hw/virtio.h    |    4
 3 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/hw/vhost_net.c b/hw/vhost_net.c
index 2e292ee..d77c9b2 100644
--- a/hw/vhost_net.c
+++ b/hw/vhost_net.c
@@ -51,6 +51,9 @@ unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
     if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) {
         features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
     }
+    if (!(net->dev.features & (1 << VIRTIO_RING_F_PUBLISH_USED))) {
+        features &= ~(1 << VIRTIO_RING_F_PUBLISH_USED);
+    }
     features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
     return features;
 }
@@ -64,6 +67,9 @@ void vhost_net_ack_features(struct vhost_net *net, unsigned features)
     if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) {
         net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
     }
+    if (features & (1 << VIRTIO_RING_F_PUBLISH_USED)) {
+        net->dev.acked_features |= (1 << VIRTIO_RING_F_PUBLISH_USED);
+    }
 }

 static int vhost_net_get_fd(VLANClientState *backend)
diff --git a/hw/virtio.c b/hw/virtio.c
index e7657ae..5d686f0 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -71,6 +71,7 @@ struct VirtQueue
     target_phys_addr_t pa;
     uint16_t last_avail_idx;
     int inuse;
+    int num_notify;
     uint16_t vector;
     void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
     VirtIODevice *vdev;
@@ -139,6 +140,11 @@ static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
     return lduw_phys(pa);
 }

+static inline uint16_t vring_last_used_idx(VirtQueue *vq)
+{
+    return vring_avail_ring(vq, vq->vring.num);
+}
+
 static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
 {
     target_phys_addr_t pa;
@@ -234,6 +240,7 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count)
     wmb();
     vring_used_idx_increment(vq, count);
     vq->inuse -= count;
+    vq->num_notify += count;
 }

 void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
@@ -603,6 +610,14 @@ void virtio_irq(VirtQueue *vq)

 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
+    /* Do not notify if guest did not yet see the last update. */
+    if ((vdev->guest_features & (1 << VIRTIO_RING_F_PUBLISH_USED)) &&
+        vring_last_used_idx(vq) !=
+        (uint16_t)(vring_used_idx(vq) + vq->num_notify))
+        return;
+
+    vq->num_notify = 0;
+
     /* Always notify when queue is empty (when feature acknowledge) */
     if ((vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT) &&
         (!(vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) ||
diff --git a/hw/virtio.h b/hw/virtio.h
index f885f1b..4bdfded 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -43,6 +43,8 @@
 #define VIRTIO_F_NOTIFY_ON_EMPTY        24
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC     28
+/* The Guest publishes last-seen used index at the end of the avail ring. */
+#define VIRTIO_RING_F_PUBLISH_USED      29
 /* A guest should never accept this. It implies negotiation is broken. */
 #define VIRTIO_F_BAD_FEATURE            30
@@ -189,6 +191,8 @@ VirtIODevice *virtio_balloon_init(DeviceState *dev);
 void virtio_net_exit(VirtIODevice *vdev);

 #define DEFINE_VIRTIO_COMMON_FEATURES(_state, _field) \
+    DEFINE_PROP_BIT("publish_used", _state, _field, \
+                    VIRTIO_RING_F_PUBLISH_USED, true), \
     DEFINE_PROP_BIT("indirect_desc", _state, _field, \
                     VIRTIO_RING_F_INDIRECT_DESC, true)
--
1.7.1.12.g42b7f
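The feature-negotiation pattern in the vhost_net.c hunks above is plain bit masking: offer only what the backend supports, then record what the guest acknowledged. Here is a standalone sketch of that logic (the helper names and the simplified `uint32_t` feature word are mine, not qemu's):

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative feature bit numbers matching the patch above. */
#define F_INDIRECT_DESC 28
#define F_PUBLISH_USED  29

/* Offer to the guest only the features the backend also supports:
 * clear each bit the backend lacks from the offered set. */
static uint32_t get_features(uint32_t backend_features, uint32_t features)
{
    if (!(backend_features & (1u << F_INDIRECT_DESC)))
        features &= ~(1u << F_INDIRECT_DESC);
    if (!(backend_features & (1u << F_PUBLISH_USED)))
        features &= ~(1u << F_PUBLISH_USED);
    return features;
}

/* Record which of the offered feature bits the guest acknowledged. */
static uint32_t ack_features(uint32_t guest_features)
{
    uint32_t acked = 0;
    if (guest_features & (1u << F_INDIRECT_DESC))
        acked |= 1u << F_INDIRECT_DESC;
    if (guest_features & (1u << F_PUBLISH_USED))
        acked |= 1u << F_PUBLISH_USED;
    return acked;
}
```

A feature is live only when it survives both passes, which is why the qemu patch has to touch both `vhost_net_get_features()` and `vhost_net_ack_features()` for the new bit.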
Re: virtio: put last_used and last_avail index into ring itself.
On Fri, May 07, 2010 at 12:35:39PM +0930, Rusty Russell wrote: On Thu, 6 May 2010 03:57:55 pm Michael S. Tsirkin wrote: On Thu, May 06, 2010 at 10:22:12AM +0930, Rusty Russell wrote: On Wed, 5 May 2010 03:52:36 am Michael S. Tsirkin wrote: What do you think? I think everyone is settled on 128 byte cache lines for the foreseeable future, so it's not really an issue. You mean with 64 bit descriptors we will be bouncing a cache line between host and guest, anyway? I'm confused by this entire thread. Descriptors are 16 bytes. They are at the start, so presumably aligned to cache boundaries. Available ring follows that at 2 bytes per entry, so it's also packed nicely into cachelines. Then there's padding to page boundary. That puts us on a cacheline again for the used ring; also 2 bytes per entry. Hmm, is used ring really 2 bytes per entry?

	/* u32 is used here for ids for padding reasons. */
	struct vring_used_elem {
		/* Index of start of used descriptor chain. */
		__u32 id;
		/* Total length of the descriptor chain which was used (written to) */
		__u32 len;
	};

	struct vring_used {
		__u16 flags;
		__u16 idx;
		struct vring_used_elem ring[];
	};

I don't see how any change in layout could be more cache friendly? Rusty. I thought that used ring has 8 bytes per entry, and that struct vring_used is aligned at page boundary, so a ring element is at offset 4 bytes from the page boundary. Thus with cacheline size 128 bytes, each 4th element crosses a cacheline boundary. If we had a 4 byte padding after idx, each used element would always be completely within a single cacheline. What am I missing?

-- MST
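The layout arithmetic being debated is easy to check in userspace with the structure definitions quoted above (a sketch using standard C types in place of the kernel's `__u16`/`__u32`; the cache-line size is a parameter, since the thread considers both 64- and 128-byte lines):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* The used-ring layout quoted above, with userspace types. */
struct vring_used_elem {
    uint32_t id;   /* index of start of used descriptor chain */
    uint32_t len;  /* total length of the chain that was written to */
};

struct vring_used {
    uint16_t flags;
    uint16_t idx;
    struct vring_used_elem ring[];
};

/* Byte offset of used ring entry i from the (page-aligned) start of
 * struct vring_used: 4 bytes of flags+idx, then 8 bytes per entry. */
static size_t used_elem_offset(unsigned int i)
{
    return offsetof(struct vring_used, ring)
           + (size_t)i * sizeof(struct vring_used_elem);
}

/* Does entry i straddle a cache-line boundary of the given size? */
static int used_elem_crosses_line(unsigned int i, size_t line)
{
    size_t start = used_elem_offset(i);
    size_t end = start + sizeof(struct vring_used_elem) - 1;
    return start / line != end / line;
}
```

With the 4-byte padding after `idx` that Michael suggests, `used_elem_offset(i)` would become a multiple of 8 and no 8-byte entry could ever straddle a line whose size is a multiple of 8.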
Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
On Sat, May 08, 2010 at 03:55:48PM +0800, Xin, Xiaohui wrote: Michael, Sorry, somehow I missed this mail. :-( Here, we have considered 2 ways to utilize the page constructor API to dispense the user buffers.

One: Modify __alloc_skb() a bit, so it allocates only the sk_buff structure, with the data pointer pointing to a user buffer coming from the page constructor API. The shinfo of the skb then also comes from the guest. When a packet is received from hardware, skb->data is filled directly by h/w. What we have done is in this way. Pros: We can avoid any copy here. Cons: The guest virtio-net driver needs to allocate skbs in almost the same way as the host NIC drivers, say the size of netdev_alloc_skb() and the same reserved space in the head of the skb. Many NIC drivers match the guest and are ok for this, but some of the latest NIC drivers reserve special room in the skb head. To deal with it, we suggest providing a method in the guest virtio-net driver to ask for the parameters we are interested in from the NIC driver once we know which device we have bound for zero-copy. Then we ask the guest to do so. Is that reasonable? Do you still do this? Currently, we still use the first way. But we now ignore the room which host skb_reserve() required when the device is doing zero-copy. Now we don't taint the guest virtio-net driver with a new method this way.

Two: Modify the driver to get user buffers allocated from a page constructor API (to substitute alloc_page()); the user buffers are used as payload buffers and filled by h/w directly when a packet is received. The driver should associate the pages with the skb (skb_shinfo(skb)->frags). For the head buffer side, let the host allocate the skb, and h/w fills it. After that, the data filled into the host skb header will be copied into the guest header buffer, which is submitted together with the payload buffer. Pros: We care less about how the guest or host allocates its buffers. Cons: We still need a small copy here for the skb header.
We are not sure which way is better here. This is the first thing we want to get comments on from the community. We wish the modification to the network part to be generic, not used by the vhost-net backend only: a user application may use it as well when the zero-copy device provides async read/write operations later. I commented on this in the past. Do you still want comments? Now we continue with the first way and try to push it. But any comments about the two methods are still welcome. That's nice. The thing to do is probably to enable GSO/TSO and see what we get this way. Also, mergeable buffer support was recently posted and I hope to merge it for 2.6.35. You might want to take a look. I'm looking at the mergeable buffers. I think GSO/GRO support with zero-copy also needs it. Currently, GSO/TSO is still not supported by vhost-net? GSO/TSO are currently supported with tap and macvtap; the AF_PACKET socket backend still needs some work to enable GSO.

-- MST
Re: [PATCH RFC] virtio: put last seen used index into ring itself
On Fri, May 07, 2010 at 01:03:28PM +0930, Rusty Russell wrote: On Thu, 6 May 2010 03:49:46 pm Michael S. Tsirkin wrote: Now, I also added an mb() in guest between read and write so that last used index write can not get ahead of used index read. It does feel good to have it there, but I can not say why it's helpful. Works fine without it, but then these subtle races might be hard to trigger. What do you think? I couldn't see that in the patch? I don't think it's necessary though, since the write of last_used depends on the read of used (and no platform we care about would reorder such a thing). Well, there's no data dependency, is there? I'm reasonably happy, but we should write some convenient test for missing interrupts. I'm thinking of a sender which does a loop: blast 1MB of UDP packets, then print the time and sleep(1). The receiver would print the time every 1MB of received data. The two times should almost exactly correspond. Assuming that the network doesn't overflow and lose stuff, this should identify any missing wakeups/interrupts (depending on the direction used). Cheers, Rusty.
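Rusty's proposed missing-wakeup harness could start from a loopback sketch like this. The function name and the interleaved send/receive are my own simplifications: a real test would run sender and receiver on separate hosts, blast 1MB bursts, and compare timestamps, whereas here receiving after every send merely keeps the socket buffer from overflowing so the byte counts must match exactly.

```c
#include <arpa/inet.h>
#include <assert.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Send `count` datagrams of `pkt_len` bytes over loopback and return
 * the total number of bytes received.  On loopback with interleaved
 * send/recv nothing should be lost, so the total equals count*pkt_len;
 * over a real NIC, a persistent shortfall would point at lost wakeups. */
static size_t blast_and_count(size_t count, size_t pkt_len)
{
    int rx = socket(AF_INET, SOCK_DGRAM, 0);
    int tx = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in addr;
    socklen_t alen = sizeof(addr);
    char buf[2048];
    size_t total = 0;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = 0; /* let the kernel pick a port */
    bind(rx, (struct sockaddr *)&addr, sizeof(addr));
    getsockname(rx, (struct sockaddr *)&addr, &alen);

    memset(buf, 0, sizeof(buf));
    for (size_t i = 0; i < count; i++) {
        sendto(tx, buf, pkt_len, 0, (struct sockaddr *)&addr, sizeof(addr));
        ssize_t n = recv(rx, buf, sizeof(buf), 0);
        if (n > 0)
            total += (size_t)n;
    }
    close(rx);
    close(tx);
    return total;
}
```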
Re: [PATCHv7] add mergeable buffers support to vhost_net
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:

@@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *net)
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	vhost_hlen = vq->vhost_hlen;

 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;

-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
+	while ((datalen = vhost_head_len(vq, sock->sk))) {
+		headcount = vhost_get_desc_n(vq, vq->heads,
+					     datalen + vhost_hlen,
+					     &in, vq_log, &log);
+		if (headcount < 0)
+			break;
 		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
+		if (!headcount) {
 			if (unlikely(vhost_enable_notify(vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
@@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *net)
 			break;
 		}
 		/* We don't need to be notified again. */
-		if (out) {
-			vq_err(vq, "Unexpected descriptor format for RX: "
-			       "out %d, int %d\n", out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
+		if (vhost_hlen)
+			/* Skip header. TODO: support TSO. */
+			s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+		else
+			s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       iov_length(vq->hdr, s), vhost_hlen);
 			break;
 		}
 		err = sock->ops->recvmsg(NULL, sock, &msg, len,
 					 MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_desc(vq, headcount);
 			break;
 		}
-		/* TODO: Should check and handle checksum. */
-		if (err > len) {
-			pr_err("Discarded truncated rx packet: "
-			       " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+		if (err != datalen) {
+			pr_err("Discarded rx packet: "
+			       " len %d, expected %zd\n", err, datalen);
+			vhost_discard_desc(vq, headcount);
 			continue;
 		}
 		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
-		if (err) {
-			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
-			       vq->iov->iov_base, err);
+		if (vhost_hlen &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+				      vhost_hlen)) {
+			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
+			       vq->iov->iov_base);
 			break;
 		}
-		len += hdr_size;
-		vhost_add_used_and_signal(&net->dev, vq, head, len);
+		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+				      offsetof(typeof(hdr), num_buffers),
+				      sizeof(hdr.num_buffers))) {
+			vq_err(vq, "Failed num_buffers write");
+			vhost_discard_desc(vq, headcount);
+			break;
+		}
+		len += vhost_hlen;
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    headcount);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, len);
 		total_len += len;

OK I think I see the bug here: vhost_add_used_and_signal_n does not get the actual length, it
Re: [PATCHv7] add mergeable buffers support to vhost_net
On Mon, May 10, 2010 at 10:09:03AM -0700, David Stevens wrote: Since datalen carries the difference and will be negative by that amount from the original loop, what about just adding something like:

	}
	if (headcount)
		heads[headcount-1].len += datalen;

[and really, headcount > 0 since datalen > 0, so just:

	heads[headcount-1].len += datalen;

+-DLS

This works too, just does more checks and comparisons. I am still surprised that you were unable to reproduce the problem.

kvm-ow...@vger.kernel.org wrote on 05/10/2010 09:43:03 AM: On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:

@@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *net)
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	vhost_hlen = vq->vhost_hlen;

 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;

-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
+	while ((datalen = vhost_head_len(vq, sock->sk))) {
+		headcount = vhost_get_desc_n(vq, vq->heads,
+					     datalen + vhost_hlen,
+					     &in, vq_log, &log);
+		if (headcount < 0)
+			break;
 		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
+		if (!headcount) {
 			if (unlikely(vhost_enable_notify(vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
@@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *net)
 			break;
 		}
 		/* We don't need to be notified again. */
-		if (out) {
-			vq_err(vq, "Unexpected descriptor format for RX: "
-			       "out %d, int %d\n", out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
+		if (vhost_hlen)
+			/* Skip header. TODO: support TSO. */
+			s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+		else
+			s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       iov_length(vq->hdr, s), vhost_hlen);
 			break;
 		}
 		err = sock->ops->recvmsg(NULL, sock, &msg, len,
 					 MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_desc(vq, headcount);
 			break;
 		}
-		/* TODO: Should check and handle checksum. */
-		if (err > len) {
-			pr_err("Discarded truncated rx packet: "
-			       " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+		if (err != datalen) {
+			pr_err("Discarded rx packet: "
+			       " len %d, expected %zd\n", err, datalen);
+			vhost_discard_desc(vq, headcount);
 			continue;
 		}
 		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
-		if (err) {
-			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
-			       vq->iov->iov_base, err);
+		if (vhost_hlen &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+				      vhost_hlen)) {
+			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
+			       vq->iov->iov_base);
 			break;
 		}
-		len += hdr_size;
-		vhost_add_used_and_signal(&net->dev, vq, head, len);
+		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+				      offsetof(typeof(hdr), num_buffers),
+				      sizeof(hdr.num_buffers))) {
+			vq_err(vq, "Failed num_buffers write");
+			vhost_discard_desc(vq, headcount);
+			break;
+		}
+		len += vhost_hlen;
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    headcount);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, len);
 		total_len += len;

OK I think I see the bug here: vhost_add_used_and_signal_n does not get the actual length, it gets the iovec length from vhost. Guest virtio uses this as packet length, with bad results. So I have applied the
Re: [PATCHv7] add mergeable buffers support to vhost_net
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:

@@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *net)
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	vhost_hlen = vq->vhost_hlen;

 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;

-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
+	while ((datalen = vhost_head_len(vq, sock->sk))) {
+		headcount = vhost_get_desc_n(vq, vq->heads,
+					     datalen + vhost_hlen,
+					     &in, vq_log, &log);
+		if (headcount < 0)
+			break;
 		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
+		if (!headcount) {
 			if (unlikely(vhost_enable_notify(vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */

So I think this breaks handling for a failure mode where we get an skb that is larger than the max packet the guest can get. The right thing to do in this case is to drop the skb; we currently do this by passing the truncate flag to recvmsg. In particular, with mergeable buffers off, if we get an skb that does not fit in a single packet, this code will spread it over multiple buffers. You should be able to reproduce this fairly easily by disabling both indirect buffers and mergeable buffers on the qemu command line. With the current code TCP still works by falling back on small packets. I think with your code it will get stuck forever once we get an skb that is too large for us to handle.

-- MST
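The "drop oversized skbs by passing the truncate flag" behaviour relies on a specific recvmsg semantic: with MSG_TRUNC on a datagram socket, the excess bytes are discarded but the return value is the real packet length, so the caller can tell the packet was too big. A userspace demonstration (my own helper name; uses an AF_UNIX datagram socketpair, which on Linux supports MSG_TRUNC this way since 3.4):

```c
#include <assert.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Send a datagram of sent_len bytes, then receive it into a buffer of
 * buf_len bytes with MSG_TRUNC.  The return value is the real packet
 * length even when the tail was discarded, so the receiver can compare
 * it against the buffer space it had and drop oversized packets. */
static ssize_t recv_len_with_trunc(size_t sent_len, size_t buf_len)
{
    int sv[2];
    char payload[1024] = {0};
    char buf[1024];
    ssize_t n;

    if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
        return -1;
    send(sv[0], payload, sent_len, 0);
    n = recv(sv[1], buf, buf_len, MSG_TRUNC);
    close(sv[0]);
    close(sv[1]);
    return n;
}
```

In the vhost rx path the same comparison is what lets it detect `err != datalen` / oversized packets instead of silently delivering a truncated frame.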
[PATCH RFC] vhost: fix barrier pairing
According to memory-barriers.txt, an smp memory barrier should always be paired with another smp memory barrier, and I quote: "a lack of appropriate pairing is almost certainly an error". In the case of vhost, failure to flush out the used index update before looking at the interrupt disable flag could result in missed interrupts, resulting in a networking hang under stress. This might happen when the flags read bypasses the used index write: we see interrupts disabled and do not interrupt, while at the same time the guest writes the flags value to enable interrupts, reads an old used index value, thinks that the used ring is empty and waits for an interrupt.

Note: the barrier we pair with here is in drivers/virtio/virtio_ring.c, function vring_enable_cb.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

Dave, I think this is needed in 2.6.34, I'll send a pull request after doing some more testing. Rusty, Juan, could you take a look as well please? Thanks!

 drivers/vhost/vhost.c |    5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e69d238..14fa2f5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1035,7 +1035,10 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-	__u16 flags = 0;
+	__u16 flags;
+	/* Flush out used index updates. */
+	smp_mb();
+
 	if (get_user(flags, &vq->avail->flags)) {
 		vq_err(vq, "Failed to get flags");
 		return;
--
1.7.1.12.g42b7f
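The failure mode this patch closes is the classic store-buffering pattern: each side writes one location then reads the other's, and without paired barriers both can read stale values — host skips the interrupt, guest waits for one. The litmus test below models it with C11 sequentially-consistent atomics standing in for the paired smp_mb()s (the names and the mapping of "flags"/"used index" onto two ints are my own simplification): with the barriers in place, the "both read stale" outcome is forbidden.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

/* Store-buffering litmus test modelling the pairing described above:
 * "host" publishes a used entry and then reads the flags word;
 * "guest" writes the flags word and then re-checks the used index. */
static atomic_int used_idx, avail_flags;
static int host_saw_flags, guest_saw_used;

static void *host_side(void *unused)
{
    (void)unused;
    atomic_store(&used_idx, 1);                 /* add used entry */
    host_saw_flags = atomic_load(&avail_flags); /* smp_mb() pairing point */
    return NULL;
}

static void *guest_side(void *unused)
{
    (void)unused;
    atomic_store(&avail_flags, 1);              /* enable interrupts */
    guest_saw_used = atomic_load(&used_idx);    /* then check for work */
    return NULL;
}

/* Returns 1 if the forbidden "both stale" outcome was observed.
 * With seq_cst accesses it never is; relaxing both sides would make it
 * possible, which is exactly the missed-interrupt scenario. */
static int run_once(void)
{
    pthread_t h, g;
    atomic_store(&used_idx, 0);
    atomic_store(&avail_flags, 0);
    pthread_create(&h, NULL, host_side, NULL);
    pthread_create(&g, NULL, guest_side, NULL);
    pthread_join(h, NULL);
    pthread_join(g, NULL);
    return host_saw_flags == 0 && guest_saw_used == 0;
}
```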
virtio: power management
Anyone looked at power management with virtio? virtio-pci has callbacks to save/restore pci config on suspend/resume, but it seems that more action, such as restoring queue state, would be needed for e.g. suspend to disk to work. Rusty, any hints on the code in virtio-pci that deals with suspend? Is it incomplete? Thanks,

-- MST
Re: [PATCH RFC] virtio: put last seen used index into ring itself
On Tue, May 11, 2010 at 01:46:08PM -0500, Ryan Harper wrote: * Michael S. Tsirkin m...@redhat.com [2010-05-05 16:37]: Generally, the Host end of the virtio ring doesn't need to see where Guest is up to in consuming the ring. However, to completely understand what's going on from the outside, this information must be exposed. For example, host can reduce the number of interrupts by detecting that the guest is currently handling previous buffers. Fortunately, we have room to expand: the ring is always a whole number of pages and there's hundreds of bytes of padding after the avail ring and the used ring, whatever the number of descriptors (which must be a power of 2). We add a feature bit so the guest can tell the host that it's writing out the current value there, if it wants to use that. This is based on a patch by Rusty Russell, with the main difference being that we dedicate a feature bit to guest to tell the host it is writing the used index. This way we don't need to force host to publish the last available index until we have a use for it. Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Michael S. Tsirkin m...@redhat.com --- Rusty, this is a simplified form of a patch you posted in the past. I have a vhost patch that, using this feature, shows external to host bandwidth grow from 5 to 7 GB/s, by avoiding an interrupt in the window after previous interrupt was sent and before interrupts were disabled for the vq. With vhost under some external to host loads I see this window being hit about 30% sometimes. I'm finalizing the host bits and plan to send the final version for inclusion when all's ready, but I'd like to hear comments meanwhile. 
 drivers/virtio/virtio_ring.c |   28 +---
 include/linux/virtio_ring.h  |   14 +-
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 1ca8890..7729aba 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -89,9 +89,6 @@ struct vring_virtqueue
 	/* Number we've added since last sync. */
 	unsigned int num_added;

-	/* Last used index we've seen. */
-	u16 last_used_idx;
-
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	void (*notify)(struct virtqueue *vq);

@@ -285,12 +282,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)

 static inline bool more_used(const struct vring_virtqueue *vq)
 {
-	return vq->last_used_idx != vq->vring.used->idx;
+	return *vq->vring.last_used_idx != vq->vring.used->idx;
 }

 void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct vring_used_elem *u;
 	void *ret;
 	unsigned int i;

@@ -307,12 +305,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 		return NULL;
 	}

-	/* Only get used array entries after they have been exposed by host. */
-	virtio_rmb();
-
-	i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
-	*len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
+	/* Only get used array entries after they have been exposed by host.
+	 * Need mb(), not just rmb() because we write last_used_idx below. */
+	virtio_mb();
+	u = &vq->vring.used->ring[*vq->vring.last_used_idx % vq->vring.num];
+	i = u->id;
+	*len = u->len;

 	if (unlikely(i >= vq->vring.num)) {
 		BAD_RING(vq, "id %u out of range\n", i);
 		return NULL;
 	}

 	/* detach_buf clears data, so grab it now. */
 	ret = vq->data[i];
 	detach_buf(vq, i);
-	vq->last_used_idx++;
+	(*vq->vring.last_used_idx)++;
+
 	END_USE(vq);
 	return ret;
 }

@@ -431,7 +431,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 	vq->vq.name = name;
 	vq->notify = notify;
 	vq->broken = false;
-	vq->last_used_idx = 0;
+	*vq->vring.last_used_idx = 0;
 	vq->num_added = 0;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
@@ -440,6 +440,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,

 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);

+	/* We publish used index whether Host offers it or not: if not, it's
+	 * junk space anyway. But calling this acknowledges the feature. */
+	virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_USED);
+

You use VIRTIO_RING_F_PUBLISH_USED here, but VIRTIO_RING_F_PUBLISH_INDICES below...

 	/* No callback? Tell other side not to bother us. */
 	if (!callback)
 		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
@@ -473,6 +477,8 @@ void vring_transport_features(struct virtio_device *vdev
Re: [Qemu-devel] [PATCH RFC] virtio: put last seen used index into ring itself
On Tue, May 11, 2010 at 10:27:22PM +0300, Avi Kivity wrote: On 05/07/2010 06:23 AM, Rusty Russell wrote: On Thu, 6 May 2010 07:30:00 pm Avi Kivity wrote: On 05/05/2010 11:58 PM, Michael S. Tsirkin wrote:

+	/* We publish the last-seen used index at the end of the available ring.
+	 * It is at the end for backwards compatibility. */
+	vr->last_used_idx = &(vr)->avail->ring[num];
+	/* Verify that last used index does not spill over the used ring. */
+	BUG_ON((void *)vr->last_used_idx +
+	       sizeof *vr->last_used_idx > (void *)vr->used);
 }

Shouldn't this be on its own cache line? It's next to the available ring, because that's where the guest publishes its data. That whole page is guest-write, host-read. Putting it on a cacheline by itself would be a slight pessimization; the host cpu would have to get the last_used_idx cacheline and the avail descriptor cacheline every time. This way, they are sometimes the same cacheline. If one peer writes the tail of the available ring while the other reads last_used_idx, it's a false bounce, no? Having things on the same cacheline is only worthwhile if they are accessed at the same time. Yes, this is what I was trying to say. avail flags and used index *are* accessed at the same time, so there could be an advantage to sharing a cache line there. All this should be kept in mind if we ever do VIRTIO_RING_F_NEW_LAYOUT.
Re: [PATCH v2] pci: cleanly backout of pci_qdev_init()
On Tue, May 11, 2010 at 06:44:21AM -0400, Alex Williamson wrote: If the init function of a device fails, as might happen with device assignment, we never undo the work done by do_pci_register_device(). This not only causes a bit of a memory leak, but also leaves a bogus pointer in the bus devices array that can cause a segfault or garbage data from 'info pci'.

Signed-off-by: Alex Williamson <alex.william...@redhat.com>

Thanks, applied.

---
v2: Remove extraneous return from do_pci_unregister_device()

 hw/pci.c | 16 +++++++++++-----
 1 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/hw/pci.c b/hw/pci.c
index f167436..3ca5f09 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -625,6 +625,13 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
     return pci_dev;
 }

+static void do_pci_unregister_device(PCIDevice *pci_dev)
+{
+    qemu_free_irqs(pci_dev->irq);
+    pci_dev->bus->devices[pci_dev->devfn] = NULL;
+    pci_config_free(pci_dev);
+}
+
 PCIDevice *pci_register_device(PCIBus *bus, const char *name,
                                int instance_size, int devfn,
                                PCIConfigReadFunc *config_read,
@@ -680,10 +687,7 @@ static int pci_unregister_device(DeviceState *dev)
         return ret;

     pci_unregister_io_regions(pci_dev);
-
-    qemu_free_irqs(pci_dev->irq);
-    pci_dev->bus->devices[pci_dev->devfn] = NULL;
-    pci_config_free(pci_dev);
+    do_pci_unregister_device(pci_dev);

     return 0;
 }
@@ -1652,8 +1656,10 @@ static int pci_qdev_init(DeviceState *qdev, DeviceInfo *base)
     if (pci_dev == NULL)
         return -1;
     rc = info->init(pci_dev);
-    if (rc != 0)
+    if (rc != 0) {
+        do_pci_unregister_device(pci_dev);
         return rc;
+    }

     /* rom loading */
     if (pci_dev->romfile == NULL && info->romfile != NULL)
[GIT PULL] last minute vhost-net fix
David, if it's not too late, please pull the following last-minute fix into 2.6.34. Thanks!

The following changes since commit de02d72bb3cc5b3d4c873db4ca8291723dd48479:

  Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6 (2010-05-10 22:53:41 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git net-2.6

Michael S. Tsirkin (1):
      vhost: fix barrier pairing

 drivers/vhost/vhost.c | 7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

-- MST