Re: [RFC] vhost-blk implementation

2010-04-08 Thread Stefan Hajnoczi
On Fri, Mar 26, 2010 at 6:53 PM, Eran Rom er...@il.ibm.com wrote:
 Christoph Hellwig hch at infradead.org writes:


 Ok.  cache=writeback performance is something I haven't bothered looking
 at at all.  For cache=none any streaming write or random workload with
 large enough record sizes got basically the same performance as native
 using kernel aio, and same for write but slightly degraded for reads
 using the thread pool.  See my attached JLS presentation for some
 numbers.

 Looks like the presentation did not make it...

I am interested in the JLS presentation too.  Here is what I found,
hope it's the one you meant, Christoph:

http://events.linuxfoundation.org/images/stories/slides/jls09/jls09_hellwig.odp

Stefan


[RFC] vhost-blk implementation (v2)

2010-04-06 Thread Badari Pulavarty
Hi All,

Here is the latest version of vhost-blk implementation.
Major difference from my previous implementation is that, I
now merge all contiguous requests (both read and write), before
submitting them. This significantly improved IO performance.
I am still collecting performance numbers, I will be posting
in next few days.
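
(For reference, the merge pass described above needs a contiguity test of roughly this shape - a hypothetical helper, not taken from the patch below, assuming 512-byte sectors and a byte-granular len field:)

static int vbio_mergeable(struct vhost_blk_io *prev, struct vhost_blk_io *next)
{
	/* only merge requests going in the same direction (read vs. write)... */
	if ((prev->type & VIRTIO_BLK_T_OUT) != (next->type & VIRTIO_BLK_T_OUT))
		return 0;
	/* ...and only if they are byte-contiguous on the backing file */
	return (prev->sector << 9) + prev->len == (next->sector << 9);
}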

Comments ?

Todo:
- Address hch's comments on annotations
- Implement per device read/write queues
- Finish up error handling

Thanks,
Badari

---
 drivers/vhost/blk.c |  445 
 1 file changed, 445 insertions(+)

Index: net-next/drivers/vhost/blk.c
===
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-04-06 16:38:03.563847905 -0400
@@ -0,0 +1,445 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamelessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+#define SECTOR_SHIFT 9
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+	struct list_head list;
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint32_t nvecs;
+	uint64_t sector;
+	uint64_t len;
+	struct iovec iov[0];
+};
+
+static struct workqueue_struct *vblk_workqueue;
+static LIST_HEAD(write_queue);
+static LIST_HEAD(read_queue);
+
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio, *entry;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	struct list_head single, *head, *node, *tmp;
+
+	int i, need_free, ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 8;
+
+	use_mm(blk->dev.mm);
+	if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	if (vbio->head != -1) {
+		INIT_LIST_HEAD(&single);
+		list_add(&vbio->list, &single);
+		head = &single;
+		need_free = 0;
+	} else {
+		head = &vbio->list;
+		need_free = 1;
+	}
+	list_for_each_entry(entry, head, list) {
+		copy_to_user(entry->iov[entry->nvecs].iov_base, &status, sizeof status);
+	}
+	mutex_lock(&vq->mutex);
+	list_for_each_safe(node, tmp, head) {
+		entry = list_entry(node, struct vhost_blk_io, list);
+		vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret);
+		list_del(node);
+		kfree(entry);
+	}
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	if (need_free)
+		kfree(vbio);
+}
+
+static struct vhost_blk_io *allocate_vbio(int nvecs)
+{
+	struct vhost_blk_io *vbio;
+	int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec);
+	vbio = kmalloc(size, GFP_KERNEL);
+	if (vbio) {
+		INIT_WORK(&vbio->work, handle_io_work);
+		INIT_LIST_HEAD(&vbio->list);
+	}
+	return vbio;
+}
+
+static void merge_and_handoff_work(struct list_head *queue)
+{
+	struct vhost_blk_io *vbio, *entry;
+	int nvecs = 0;
+	int entries = 0;
+
+	list_for_each_entry(entry, queue, list) {
+		nvecs += entry->nvecs;
+		entries++;
+	}
+
+	if (entries == 1) {
+		vbio = list_first_entry(queue, struct vhost_blk_io, list);
+		list_del(&vbio->list);
+		queue_work(vblk_workqueue, &vbio->work);
+		return;
+	}
+
+	vbio = allocate_vbio(nvecs);
+	if (!vbio) {
+		/* Unable to allocate memory - submit IOs individually */
+		list_for_each_entry(vbio, queue, list) {
+			queue_work(vblk_workqueue, &vbio->work);
+		}
+		INIT_LIST_HEAD(queue);
+		return;
+	}
+
+	entry = list_first_entry(queue, struct vhost_blk_io, list);
+	vbio->nvecs = nvecs;
+	vbio->blk = entry->blk;
+

Re: [RFC] vhost-blk implementation

2010-04-05 Thread Stefan Hajnoczi
On Mon, Mar 29, 2010 at 4:41 PM, Badari Pulavarty pbad...@us.ibm.com wrote:
 +static void handle_io_work(struct work_struct *work)
 +{
 +       struct vhost_blk_io *vbio;
 +       struct vhost_virtqueue *vq;
 +       struct vhost_blk *blk;
 +       int i, ret = 0;
 +       loff_t pos;
 +       uint8_t status = 0;
 +
 +       vbio = container_of(work, struct vhost_blk_io, work);
 +       blk = vbio->blk;
 +       vq = &blk->dev.vqs[0];
 +       pos = vbio->sector << 8;
 +
 +       use_mm(blk->dev.mm);
 +
 +       if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
 +               ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
 +       } else if (vbio->type & VIRTIO_BLK_T_OUT) {
 +               ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
 +       } else {
 +               ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
 +       }
 +
 +       status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
 +       if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
 +               printk("copy to user failed\n");
 +               vhost_discard_vq_desc(vq);
 +               unuse_mm(blk->dev.mm);
 +               return;

Do you need to kfree(vbio) here?
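
Something along these lines would avoid the leak - a sketch against the code quoted above, not the actual fix:

	if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
		printk("copy to user failed\n");
		vhost_discard_vq_desc(vq);
		unuse_mm(blk->dev.mm);
		kfree(vbio);		/* free the work item before bailing out */
		return;
	}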

 +static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
 +{
 +       struct file *file;
 +       struct vhost_virtqueue *vq;
 +
 +       file = fget(fd);
 +       if (!file)
 +               return -EBADF;
 +
 +       vq = n->vqs + index;
 +       mutex_lock(&vq->mutex);
 +       rcu_assign_pointer(vq->private_data, file);
 +       mutex_unlock(&vq->mutex);
 +       return 0;
 +}
 +
 +
 +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
 +                            unsigned long arg)
 +{
 +       struct vhost_blk *n = f->private_data;
 +       void __user *argp = (void __user *)arg;
 +       struct vhost_vring_file backend;
 +       int r;
 +
 +       switch (ioctl) {
 +        case VHOST_NET_SET_BACKEND:
 +               r = copy_from_user(&backend, argp, sizeof backend);
 +               if (r < 0)
 +                       return r;
 +               return vhost_blk_set_backend(n, backend.index, backend.fd);

I don't see backend.index being checked against VHOST_BLK_VQ_MAX.
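
A sketch of the missing check, mirroring the vhost-net convention (the exact errno is a judgment call):

	if (index >= VHOST_BLK_VQ_MAX)
		return -ENOBUFS;	/* reject before indexing n->vqs */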

Stefan


Re: [RFC] vhost-blk implementation

2010-04-05 Thread Christoph Hellwig
On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote:
 iovecs and buffers are user-space pointers (from the host kernel point  
 of view). They are
 guest address. So, I don't need to do any set_fs tricks.

From verifying the code and using the sparse annotations it appears
that the actual buffers are userspace pointers, but the iovecs in
the virtqueue are kernel level pointers, so you would need some
annotations.
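
For illustration, one way to annotate the buffer pointers so sparse stops warning - a sketch only, not part of the patch below; sparse itself can be run with something like "make C=2 drivers/vhost/blk.o":

	/* the data buffer lives in the guest/user address space */
	void __user *base = (void __user *)vbio->iov[vbio->nvecs].iov_base;

	if (copy_to_user(base, &status, sizeof status))
		...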

While we're at it here is a patch fixing the remaining sparse 
warnings in vhost-blk:


Signed-off-by: Christoph Hellwig h...@lst.de

Index: linux-2.6/drivers/vhost/blk.c
===
--- linux-2.6.orig/drivers/vhost/blk.c  2010-04-05 21:15:11.638004250 +0200
+++ linux-2.6/drivers/vhost/blk.c   2010-04-05 21:16:13.238004599 +0200
@@ -86,7 +86,7 @@ static void handle_blk(struct vhost_blk
 		nvecs++;
 		BUG_ON(vq->iov[nvecs].iov_len != 1);
 
-		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status) < 0) {
+		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status)) {
 			printk("copy to user failed\n");
 			vhost_discard_vq_desc(vq);
 			break;
@@ -199,7 +199,7 @@ static struct miscdevice vhost_blk_misc
 	&vhost_blk_fops,
 };
 
-int vhost_blk_init(void)
+static int vhost_blk_init(void)
 {
 	int r = vhost_init();
 	if (r)
@@ -216,7 +216,7 @@ err_init:
 }
 module_init(vhost_blk_init);
 
-void vhost_blk_exit(void)
+static void vhost_blk_exit(void)
 {
 	misc_deregister(&vhost_blk_misc);
 	vhost_cleanup();


Re: [RFC] vhost-blk implementation

2010-04-05 Thread Christoph Hellwig
On Thu, Mar 25, 2010 at 04:00:56PM +0100, Asdo wrote:
 Would the loop device provide the features of a block device? I recall  
 barrier support at least has been added recently.

It does, but not in a very efficient way.

 Is it recommended to run kvm on a loopback mounted file compared to on a  
 raw file?

No.



Re: [RFC] vhost-blk implementation

2010-04-05 Thread Badari Pulavarty
On Mon, 2010-04-05 at 15:23 -0400, Christoph Hellwig wrote:
 On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote:
  iovecs and buffers are user-space pointers (from the host kernel point  
  of view). They are
  guest address. So, I don't need to do any set_fs tricks.
 
 From verifying the code and using the sparse annotations it appears
 that the actual buffers are userspace pointers, but the iovecs in
 the virtqueue are kernel level pointers, so you would need some
 annotations.

Yes. That's correct. I will add appropriate annotations.

 
 While we're at it here is a patch fixing the remaining sparse 
 warnings in vhost-blk:

Applied.

Thanks,
Badari

 
 Signed-off-by: Christoph Hellwig h...@lst.de
 
 Index: linux-2.6/drivers/vhost/blk.c
 ===
 --- linux-2.6.orig/drivers/vhost/blk.c	2010-04-05 21:15:11.638004250 +0200
 +++ linux-2.6/drivers/vhost/blk.c	2010-04-05 21:16:13.238004599 +0200
 @@ -86,7 +86,7 @@ static void handle_blk(struct vhost_blk
  		nvecs++;
  		BUG_ON(vq->iov[nvecs].iov_len != 1);
  
 -		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status) < 0) {
 +		if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status)) {
  			printk("copy to user failed\n");
  			vhost_discard_vq_desc(vq);
  			break;
 @@ -199,7 +199,7 @@ static struct miscdevice vhost_blk_misc
  	&vhost_blk_fops,
  };
  
 -int vhost_blk_init(void)
 +static int vhost_blk_init(void)
  {
  	int r = vhost_init();
  	if (r)
 @@ -216,7 +216,7 @@ err_init:
  }
  module_init(vhost_blk_init);
  
 -void vhost_blk_exit(void)
 +static void vhost_blk_exit(void)
  {
  	misc_deregister(&vhost_blk_misc);
  	vhost_cleanup();



Re: [RFC] vhost-blk implementation

2010-04-05 Thread Badari Pulavarty
On Mon, 2010-04-05 at 15:22 +0100, Stefan Hajnoczi wrote:
 On Mon, Mar 29, 2010 at 4:41 PM, Badari Pulavarty pbad...@us.ibm.com wrote:
  +static void handle_io_work(struct work_struct *work)
  +{
  +   struct vhost_blk_io *vbio;
  +   struct vhost_virtqueue *vq;
  +   struct vhost_blk *blk;
  +   int i, ret = 0;
  +   loff_t pos;
  +   uint8_t status = 0;
  +
  +   vbio = container_of(work, struct vhost_blk_io, work);
  +   blk = vbio->blk;
  +   vq = &blk->dev.vqs[0];
  +   pos = vbio->sector << 8;
  +
  +   use_mm(blk->dev.mm);
  +
  +   if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
  +           ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
  +   } else if (vbio->type & VIRTIO_BLK_T_OUT) {
  +           ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
  +   } else {
  +           ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
  +   }
  +
  +   status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
  +   if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
  +           printk("copy to user failed\n");
  +           vhost_discard_vq_desc(vq);
  +           unuse_mm(blk->dev.mm);
  +           return;
 
 Do you need to kfree(vbio) here?

Yes. I do. As mentioned earlier, I haven't fixed error handling yet :(
 
  +static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
  +{
  +   struct file *file;
  +   struct vhost_virtqueue *vq;
  +
  +   file = fget(fd);
  +   if (!file)
  +           return -EBADF;
  +
  +   vq = n->vqs + index;
  +   mutex_lock(&vq->mutex);
  +   rcu_assign_pointer(vq->private_data, file);
  +   mutex_unlock(&vq->mutex);
  +   return 0;
  +}
  +
  +
  +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
  +                            unsigned long arg)
  +{
  +   struct vhost_blk *n = f->private_data;
  +   void __user *argp = (void __user *)arg;
  +   struct vhost_vring_file backend;
  +   int r;
  +
  +   switch (ioctl) {
  +    case VHOST_NET_SET_BACKEND:
  +           r = copy_from_user(&backend, argp, sizeof backend);
  +           if (r < 0)
  +                   return r;
  +           return vhost_blk_set_backend(n, backend.index, backend.fd);
 
 I don't see backend.index being checked against VHOST_BLK_VQ_MAX.

Yep. You are right. I will add these checks for my next revision.

Thanks,
Badari



Re: [RFC] vhost-blk implementation

2010-03-30 Thread Avi Kivity

On 03/30/2010 01:51 AM, Badari Pulavarty wrote:


   

Your io wait time is twice as long and your throughput is about half.
I think the qmeu block submission does an extra attempt at merging
requests.  Does blktrace tell you anything interesting?

   

Yes. I see that in my testcase (2M writes) - QEMU is pickup 512K
requests from the virtio ring and merging them back to 2M before
submitting them.

Unfortunately, I can't do that quite easily in vhost-blk. QEMU
does re-creates iovecs for the merged IO. I have to come up with
a scheme to do this :(
   


I don't think that either vhost-blk or virtio-blk should do this.  
Merging increases latency, and in the case of streaming writes makes it 
impossible for the guest to prepare new requests while earlier ones are 
being serviced (in effect it reduces the queue depth to 1).


qcow2 does benefit from merging, but it should do so itself without 
impacting raw.



It does.  I suggest using fio O_DIRECT random access patterns to avoid
such issues.
 

Well, I am not trying to come up with a test case where vhost-blk
performs better than virtio-blk. I am trying to understand where
and why vhost-blk performnce worse than virtio-blk.
   


In this case qemu-virtio is making an incorrect tradeoff.  The guest 
could easily merge those requests itself.  If you want larger writes, 
tune the guest to issue them.


Another way to look at it:  merging improved bandwidth but increased 
latency, yet you are only measuring bandwidth.  If you measured only 
latency you'd find that vhost-blk is better.


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.



Re: [RFC] vhost-blk implementation

2010-03-29 Thread Badari Pulavarty
Hi Christoph,

I am wondering if you can provide your thoughts here..

I modified my vhost-blk implementation to offload work to
workqueues instead of doing it synchronously. In fact, I tried
to spread the work across all the CPUs. But to my surprise,
this did not improve the performance compared to virtio-blk.

I see vhost-blk taking more interrupts and context switches
compared to virtio-blk. What is virtio-blk doing that I
am not able to do from vhost-blk?

Thanks,
Badari


vhost-blk

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo    in    cs us sy id wa st
 3  1   8920  56076  20760 56035560  104   196 79826 17164 13912  0  5 65 30  0
 2  4   9488  57216  20744 56056160  114   195 81120 17397 13824  0  5 65 30  0
 2  2  10028  68476  20728 55947640  108   206 80318 17162 13845  0  5 65 30  0
 0  4  10560  70856  20708 55930880  106   205 82363 17402 13904  0  5 65 30  0
 1  3  10948  80380  20672 55844520   78   178 79714 17113 13875  0  5 66 29  0

qemu virtio-blk:

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo    in    cs us sy id wa st
 0  1  14124  57456   5144 492406000   139 142546 11287 9312  1  4 80 15  0
 0  2  14124  56736   5148 492739600   146 142968 11283 9248  1  4 80 15  0
 0  1  14124  56712   5384 49270200074 150738 11182 9327  1  4 80 16  0
 1  1  14124  55496   5392 492790400 2 159902 11172 9401  1  3 79 17  0
 0  1  14124  55968   5408 492723200 0 159202 11212 9325  1  3 80 16  0

---
 drivers/vhost/blk.c |  310 
 1 file changed, 310 insertions(+)

Index: net-next/drivers/vhost/blk.c
===
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-03-25 20:06:57.484054770 -0400
@@ -0,0 +1,310 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamelessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+#if 0
+#define myprintk(fmt, ...) printk(pr_fmt(fmt), ##__VA_ARGS__)
+#else
+#define myprintk(fmt, ...)
+#endif
+
+struct vhost_blk {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+	struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+	struct work_struct work;
+	struct vhost_blk *blk;
+	struct file *file;
+	int head;
+	uint32_t type;
+	uint64_t sector;
+	struct iovec *iov;
+	int nvecs;
+};
+
+static struct workqueue_struct *vblk_workqueue;
+
+static void handle_io_work(struct work_struct *work)
+{
+	struct vhost_blk_io *vbio;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk *blk;
+	int i, ret = 0;
+	loff_t pos;
+	uint8_t status = 0;
+
+	vbio = container_of(work, struct vhost_blk_io, work);
+	blk = vbio->blk;
+	vq = &blk->dev.vqs[0];
+	pos = vbio->sector << 8;
+
+	use_mm(blk->dev.mm);
+
+	if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	} else {
+		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+	}
+
+	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+	if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
+		printk("copy to user failed\n");
+		vhost_discard_vq_desc(vq);
+		unuse_mm(blk->dev.mm);
+		return;
+	}
+	mutex_lock(&vq->mutex);
+	vhost_add_used_and_signal(&blk->dev, vq, vbio->head, ret);
+	mutex_unlock(&vq->mutex);
+	unuse_mm(blk->dev.mm);
+	kfree(vbio);
+}
+
+static int cpu = 0;
+static int handoff_io(struct vhost_blk *blk, int head,
+			uint32_t type, uint64_t sector,
+			struct iovec *iov, int nvecs)
+{
+	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+	struct vhost_blk_io *vbio;
+
+	vbio = kmalloc(sizeof(struct vhost_blk_io), GFP_KERNEL);
+	if (!vbio)
+		return -ENOMEM;
+
+	INIT_WORK(&vbio->work, handle_io_work);
+	vbio->blk = blk;
+	vbio->file = vq->private_data;
+	vbio->head = head;
+	vbio->type = type;
+	vbio->sector = sector;
+	vbio->iov = iov;
+	vbio->nvecs = nvecs;
+
+

Re: [RFC] vhost-blk implementation

2010-03-29 Thread Chris Wright
* Badari Pulavarty (pbad...@us.ibm.com) wrote:
 I modified my vhost-blk implementation to offload work to
 work_queues instead of doing synchronously. Infact, I tried
 to spread the work across all the CPUs. But to my surprise,
 this did not improve the performance compared to virtio-blk.
 
 I see vhost-blk taking more interrupts and context switches
 compared to virtio-blk. What is virtio-blk doing which I
 am not able to from vhost-blk ???

Your io wait time is twice as long and your throughput is about half.
I think the qemu block submission does an extra attempt at merging
requests.  Does blktrace tell you anything interesting?
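
For reference, a typical way to capture that on the host (standard blktrace/blkparse usage; the device name is only an example):

  # blktrace -d /dev/sdb -o vblk -w 30         # trace the backing device for 30 seconds
  # blkparse -i vblk -d vblk.bin | less        # per-request events; 'M' entries are merges
  # btt -i vblk.bin                            # queue-depth and merge statistics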

  procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
   r  b   swpd   free   buff  cache   si   so    bi    bo    in    cs us sy id wa st
   3  1   8920  56076  20760 56035560  104   196 79826 17164 13912  0  5 65 30  0
   2  4   9488  57216  20744 56056160  114   195 81120 17397 13824  0  5 65 30  0
   2  2  10028  68476  20728 55947640  108   206 80318 17162 13845  0  5 65 30  0
   0  4  10560  70856  20708 55930880  106   205 82363 17402 13904  0  5 65 30  0
   1  3  10948  80380  20672 55844520   78   178 79714 17113 13875  0  5 66 29  0
 
  qemu virtio-blk:
 
  procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
   r  b   swpd   free   buff  cache   si   so    bi    bo    in    cs us sy id wa st
   0  1  14124  57456   5144 492406000   139 142546 11287 9312  1  4 80 15  0
   0  2  14124  56736   5148 492739600   146 142968 11283 9248  1  4 80 15  0
   0  1  14124  56712   5384 49270200074 150738 11182 9327  1  4 80 16  0
   1  1  14124  55496   5392 492790400 2 159902 11172 9401  1  3 79 17  0
   0  1  14124  55968   5408 492723200 0 159202 11212 9325  1  3 80 16  0


Re: [RFC] vhost-blk implementation

2010-03-29 Thread Avi Kivity

On 03/29/2010 09:20 PM, Chris Wright wrote:

* Badari Pulavarty (pbad...@us.ibm.com) wrote:
   

I modified my vhost-blk implementation to offload work to
work_queues instead of doing synchronously. Infact, I tried
to spread the work across all the CPUs. But to my surprise,
this did not improve the performance compared to virtio-blk.

I see vhost-blk taking more interrupts and context switches
compared to virtio-blk. What is virtio-blk doing which I
am not able to from vhost-blk ???
 

Your io wait time is twice as long and your throughput is about half.
I think the qmeu block submission does an extra attempt at merging
requests.  Does blktrace tell you anything interesting?
   


It does.  I suggest using fio O_DIRECT random access patterns to avoid 
such issues.
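
For example, a fio job along these lines (illustrative values) keeps readahead and host-side merging out of the picture:

  [randread-direct]
  filename=/dev/vda
  rw=randread        ; or randwrite / randrw
  direct=1
  bs=16k
  ioengine=libaio
  iodepth=32
  runtime=60
  time_based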


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.



Re: [RFC] vhost-blk implementation

2010-03-29 Thread Badari Pulavarty
On Mon, 2010-03-29 at 23:37 +0300, Avi Kivity wrote:
 On 03/29/2010 09:20 PM, Chris Wright wrote:
  * Badari Pulavarty (pbad...@us.ibm.com) wrote:
 
  I modified my vhost-blk implementation to offload work to
  work_queues instead of doing synchronously. Infact, I tried
  to spread the work across all the CPUs. But to my surprise,
  this did not improve the performance compared to virtio-blk.
 
  I see vhost-blk taking more interrupts and context switches
  compared to virtio-blk. What is virtio-blk doing which I
  am not able to from vhost-blk ???
   
  Your io wait time is twice as long and your throughput is about half.
  I think the qmeu block submission does an extra attempt at merging
  requests.  Does blktrace tell you anything interesting?
 

Yes. I see that in my testcase (2M writes) - QEMU is picking up 512K
requests from the virtio ring and merging them back to 2M before
submitting them.

Unfortunately, I can't do that quite as easily in vhost-blk. QEMU
re-creates iovecs for the merged IO. I have to come up with
a scheme to do this :(

 It does.  I suggest using fio O_DIRECT random access patterns to avoid 
 such issues.

Well, I am not trying to come up with a test case where vhost-blk
performs better than virtio-blk. I am trying to understand where
and why vhost-blk performs worse than virtio-blk.


Thanks,
Badari




Re: [RFC] vhost-blk implementation

2010-03-29 Thread Chris Wright
* Badari Pulavarty (pbad...@us.ibm.com) wrote:
 On Mon, 2010-03-29 at 23:37 +0300, Avi Kivity wrote:
  On 03/29/2010 09:20 PM, Chris Wright wrote:
   Your io wait time is twice as long and your throughput is about half.
   I think the qmeu block submission does an extra attempt at merging
   requests.  Does blktrace tell you anything interesting?
 
 Yes. I see that in my testcase (2M writes) - QEMU is pickup 512K
 requests from the virtio ring and merging them back to 2M before
 submitting them. 
 
 Unfortunately, I can't do that quite easily in vhost-blk. QEMU
 does re-creates iovecs for the merged IO. I have to come up with
 a scheme to do this :(

Is close cooperator logic kicking in at all?  Alternatively, using same
io_context.

  It does.  I suggest using fio O_DIRECT random access patterns to avoid 
  such issues.
 
 Well, I am not trying to come up with a test case where vhost-blk
 performs better than virtio-blk. I am trying to understand where
 and why vhost-blk performnce worse than virtio-blk.

It would just level the playing field.  Alternatively, commenting out
merging in qemu to further validate it's the source, something as simple
as this in block.c:

-num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+//num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);


Although, from above, sounds like you've already verified this is the
difference.  IIRC, the sync write path is at odds w/ cfq elevator merging
(so cache=none would have trouble, but you were doing cache=writeback,
right?), but if you can hack your worker threads to share an io_context,
like you had done CLONE_IO,  you might get some merging back (again,
just a hack to pinpoint merging as the culprit).
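
For illustration only: CLONE_IO is the knob being referred to - tasks created with it share one io_context, so CFQ treats their I/O as a single stream. A minimal userspace sketch of the flag (the in-kernel equivalent would be making the vhost worker threads share the vhost thread's io_context):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int io_worker(void *arg)
{
	/* issue pread()/pwrite() here; with CLONE_IO the elevator sees this
	 * task and its creator as one I/O context and can merge across them */
	return 0;
}

int main(void)
{
	char *stack = malloc(1024 * 1024);
	pid_t pid = clone(io_worker, stack + 1024 * 1024,
			  CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO | SIGCHLD,
			  NULL);
	if (pid < 0)
		perror("clone");
	else
		waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}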

thanks,
-chris


Re: [RFC] vhost-blk implementation

2010-03-26 Thread Eran Rom
Christoph Hellwig hch at infradead.org writes:


 Ok.  cache=writeback performance is something I haven't bothered looking
 at at all.  For cache=none any streaming write or random workload with
 large enough record sizes got basically the same performance as native
 using kernel aio, and same for write but slightly degraded for reads
 using the thread pool.  See my attached JLS presentation for some
 numbers.
 
Looks like the presentation did not make it...
Thanks,
Eran



Re: [RFC] vhost-blk implementation

2010-03-25 Thread Avi Kivity

On 03/24/2010 10:05 PM, Christoph Hellwig wrote:

On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote:
   

I also think it should be done at the bio layer.  File I/O is going to
be slower, if we do vhost-blk we should concentrate on maximum
performance.  The block layer also exposes more functionality we can use
(asynchronous barriers for example).
 

The block layer is more flexible, but that limits you to only stack
directly ontop of a block device, which is extremly inflexible.
   


We still have a virtio implementation in userspace for file-based images.

In any case, the file APIs are not asynchronous so we'll need a thread 
pool.  That will probably minimize the difference in performance between 
the userspace and kernel implementations.
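
Whichever side the thread pool ends up on, the worker loop has roughly this shape - a hedged userspace sketch; struct req_queue and req_queue_pop() are hypothetical placeholders, and this is not qemu's actual code:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

struct req_queue;			/* hypothetical blocking work queue */

struct blk_req {
	int fd, is_write;
	off_t off;
	struct iovec *iov;
	int iovcnt;
	void (*complete)(struct blk_req *req, ssize_t ret);
};

extern struct blk_req *req_queue_pop(struct req_queue *q);	/* hypothetical */

static void *io_worker(void *opaque)
{
	struct req_queue *q = opaque;

	for (;;) {
		struct blk_req *r = req_queue_pop(q);	/* blocks until work arrives */
		ssize_t ret = r->is_write ?
			pwritev(r->fd, r->iov, r->iovcnt, r->off) :
			preadv(r->fd, r->iov, r->iovcnt, r->off);
		r->complete(r, ret);	/* completion marks the virtqueue entry used */
	}
	return NULL;
}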


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.



Re: [RFC] vhost-blk implementation

2010-03-25 Thread Avi Kivity

On 03/24/2010 10:22 PM, Badari Pulavarty wrote:

Which caching mode is this?  I assume data=writeback, because otherwise
you'd be doing synchronous I/O directly from the handler.



Yes. This is with default (writeback) cache model. As mentioned 
earlier, readhead is helping here

and most cases, data would be ready in the pagecache.



btw, relying on readahead is problematic.  The guest already issues 
readahead requests, and the host will issue readahead based on that 
readahead, reading far into the future.


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.



Re: [RFC] vhost-blk implementation

2010-03-25 Thread Badari Pulavarty

Avi Kivity wrote:

On 03/24/2010 10:22 PM, Badari Pulavarty wrote:

Which caching mode is this?  I assume data=writeback, because otherwise
you'd be doing synchronous I/O directly from the handler.



Yes. This is with default (writeback) cache model. As mentioned 
earlier, readhead is helping here

and most cases, data would be ready in the pagecache.



btw, relying on readahead is problematic.  The guest already issues 
readahead requests, and the host will issue readahead based on that 
readahead, reading far into the future.


Sure. In my tests, I did O_DIRECT in the guest - so there is no 
readahead in the guest.


Thanks,
Badari



Re: [RFC] vhost-blk implementation

2010-03-25 Thread Asdo

Avi Kivity wrote:

For the case where the file is actually a partition, use 
submit_bio().  When the file is a file, keep it in qemu, that path is 
going to be slower anyway. 

[CUT]

Christoph Hellwig wrote:

On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote:
  
I also think it should be done at the bio layer.  File I/O is going to  
be slower, if we do vhost-blk we should concentrate on maximum  
performance.  The block layer also exposes more functionality we can use  
(asynchronous barriers for example).



The block layer is more flexible, but that limits you to only stack
directly ontop of a block device, which is extremly inflexible.
  


Would the loop device provide the features of a block device? I recall 
barrier support at least has been added recently.
Is it recommended to run kvm on a loopback mounted file compared to on a 
raw file?




Re: [RFC] vhost-blk implementation

2010-03-25 Thread Christoph Hellwig
On Thu, Mar 25, 2010 at 08:29:03AM +0200, Avi Kivity wrote:
 We still have a virtio implementation in userspace for file-based images.

 In any case, the file APIs are not asynchronous so we'll need a thread  
 pool.  That will probably minimize the difference in performance between  
 the userspace and kernel implementations.

The kernel has real aio when doing direct I/O, although currently we
can't easily use it from kernelspace.  There's a few people who've been
looking into making it usable for that, though.



Re: [RFC] vhost-blk implementation

2010-03-25 Thread Avi Kivity

On 03/25/2010 05:48 PM, Christoph Hellwig wrote:

On Thu, Mar 25, 2010 at 08:29:03AM +0200, Avi Kivity wrote:
   

We still have a virtio implementation in userspace for file-based images.

In any case, the file APIs are not asynchronous so we'll need a thread
pool.  That will probably minimize the difference in performance between
the userspace and kernel implementations.
 

The kernel has real aio when doing direct I/O, although currently we
can't easily use it from kernelspace.  Thejre's a few people who've been
looking into making it usable for that, though.
   


Interesting.  Are there limitations on supported filesystems, or 
allocating vs. non-allocating writes?


--
error compiling committee.c: too many arguments to function



Re: [RFC] vhost-blk implementation

2010-03-25 Thread Christoph Hellwig
On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote:
 Yes. This is with default (writeback) cache model. As mentioned earlier,  
 readhead is helping here
 and most cases, data would be ready in the pagecache.

Ok.  cache=writeback performance is something I haven't bothered looking
at at all.  For cache=none any streaming write or random workload with
large enough record sizes got basically the same performance as native
using kernel aio, and same for write but slightly degraded for reads
using the thread pool.  See my attached JLS presentation for some
numbers.

   
 iovecs and buffers are user-space pointers (from the host kernel point  
 of view). They are
 guest address. So, I don't need to do any set_fs tricks.

Right now they're not declared as such, so sparse would complain.

 Yes. QEMU virtio-blk is batching up all the writes and handing of the  
 work to another
 thread.

If using the thread pool.  If using kernel aio it performs the io_submit
system call directly.

 What do should I do here ? I can create bunch of kernel threads to do  
 the IO for me.
 Or some how fit and reuse AIO io_submit() mechanism. Whats the best way  
 here ?
 I hate do duplicate all the code VFS is doing.

The only thing you can do currently is adding a thread pool.  It might
be able to use io_submit for the O_DIRECT case with some refactoring.
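
For reference, this is the shape of that io_submit path from userspace with libaio and O_DIRECT (illustrative only; error handling trimmed; link with -laio):

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <unistd.h>

int submit_one_read(const char *path, void *buf, size_t len, off_t off)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	int fd = open(path, O_RDONLY | O_DIRECT);	/* buf/len/off must be 512-byte aligned */

	io_setup(8, &ctx);			/* allow up to 8 in-flight requests */
	io_prep_pread(&cb, fd, buf, len, off);	/* fill in the iocb */
	io_submit(ctx, 1, cbs);			/* hand it to the kernel; returns immediately */
	io_getevents(ctx, 1, 1, &ev, NULL);	/* reap the completion */
	io_destroy(ctx);
	close(fd);
	return (int)ev.res;			/* bytes transferred on success */
}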



Re: [RFC] vhost-blk implementation

2010-03-24 Thread Christoph Hellwig
 Inspired by vhost-net implementation, I did initial prototype 
 of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
 I haven't handled all the error cases, fixed naming conventions etc.,
 but the implementation is stable to play with. I tried not to deviate
 from vhost-net implementation where possible.

Can you also send the qemu side of it?

 with vhost-blk:
 
 
 # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
 64+0 records in
 64+0 records out
 8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s
 
 real2m6.137s
 user0m0.281s
 sys 0m14.725s
 
 without vhost-blk: (virtio)
 ---
 
 # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
 64+0 records in
 64+0 records out
 8388608 bytes (84 GB) copied, 275.466 seconds, 305 MB/s
 
 real4m35.468s
 user0m0.373s
 sys 0m48.074s

Which caching mode is this?  I assume data=writeback, because otherwise
you'd be doing synchronous I/O directly from the handler.

 +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
 +			struct iovec *iov, int in)
 +{
 +	loff_t pos = sector << 8;
 +	int ret = 0;
 +
 +	if (type & VIRTIO_BLK_T_FLUSH)  {
 +		ret = vfs_fsync(file, file->f_path.dentry, 1);
 +	} else if (type & VIRTIO_BLK_T_OUT) {
 +		ret = vfs_writev(file, iov, in, &pos);
 +	} else {
 +		ret = vfs_readv(file, iov, in, &pos);
 +	}
 +	return ret;

I have to admit I don't understand the vhost architecture at all, but
where do the actual data pointers used by the iovecs reside?
vfs_readv/writev expect both the iovec itself and the buffers
pointed to by it to reside in userspace, so just using kernel buffers
here will break badly on architectures with different user/kernel
mappings.  A lot of this is fixable using simple set_fs & co tricks,
but for direct I/O which uses get_user_pages even that will fail badly.

Also it seems like you're doing all the I/O synchronous here?  For
data=writeback operations that could explain the read speedup
as you're avoiding context switches, but for actual write I/O
which has to get data to disk (either directly from vfs_writev or
later through vfs_fsync) this seems like a really bad idea stealing
a lot of guest time that should happen in the background.


Other than that the code seems quite nice and simple, but one huge
problem is that it'll only support raw images, and thus misses out
on all the nice image formats used in qemu deployments, especially
qcow2.  It's also missing the ioctl magic we're having in various
places, both for controlling host devices like cdroms and SG
passthrough.


Re: [RFC] vhost-blk implementation

2010-03-24 Thread Christoph Hellwig
On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote:
 I also think it should be done at the bio layer.  File I/O is going to  
 be slower, if we do vhost-blk we should concentrate on maximum  
 performance.  The block layer also exposes more functionality we can use  
 (asynchronous barriers for example).

The block layer is more flexible, but that limits you to only stack
directly on top of a block device, which is extremely inflexible.



Re: [RFC] vhost-blk implementation

2010-03-24 Thread Badari Pulavarty

Christoph Hellwig wrote:
Inspired by vhost-net implementation, I did initial prototype 
of vhost-blk to see if it provides any benefits over QEMU virtio-blk.

I haven't handled all the error cases, fixed naming conventions etc.,
but the implementation is stable to play with. I tried not to deviate
from vhost-net implementation where possible.



Can you also send the qemu side of it?

  

with vhost-blk:


# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
64+0 records in
64+0 records out
8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s

real2m6.137s
user0m0.281s
sys 0m14.725s

without vhost-blk: (virtio)
---

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
64+0 records in
64+0 records out
8388608 bytes (84 GB) copied, 275.466 seconds, 305 MB/s

real4m35.468s
user0m0.373s
sys 0m48.074s



Which caching mode is this?  I assume data=writeback, because otherwise
you'd be doing synchronous I/O directly from the handler.
  


Yes. This is with the default (writeback) cache model. As mentioned earlier,
readahead is helping here, and in most cases the data would be ready in the
pagecache.
  

+static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
+			struct iovec *iov, int in)
+{
+	loff_t pos = sector << 8;
+	int ret = 0;
+
+	if (type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(file, file->f_path.dentry, 1);
+	} else if (type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(file, iov, in, &pos);
+	} else {
+		ret = vfs_readv(file, iov, in, &pos);
+	}
+	return ret;



I have to admit I don't understand the vhost architecture at all, but
where do the actual data pointers used by the iovecs reside?
vfs_readv/writev expect both the iovec itself and the buffers
pointed to by it to reside in userspace, so just using kernel buffers
here will break badly on architectures with different user/kernel
mappings.  A lot of this is fixable using simple set_fs & co tricks,
but for direct I/O which uses get_user_pages even that will fail badly.
  
The iovecs and buffers are user-space pointers (from the host kernel point
of view). They are guest addresses. So, I don't need to do any set_fs tricks.

Also it seems like you're doing all the I/O synchronous here?  For
data=writeback operations that could explain the read speedup
as you're avoiding context switches, but for actual write I/O
which has to get data to disk (either directly from vfs_writev or
later through vfs_fsync) this seems like a really bad idea stealing
a lot of guest time that should happen in the background.
  
Yes. QEMU virtio-blk is batching up all the writes and handing off the
work to another thread. When the writes() are complete, it sends a status
completion. Since I am doing everything synchronously (even though it is a
write to pagecache), one request at a time, that explains the slowdown. We
need to find a way to

1) batch IO writes together
2) hand off to another thread to do the IO, so that vhost-thread can handle
next set of requests
3) update the status on the completion

What should I do here? I can create a bunch of kernel threads to do
the IO for me, or somehow fit in and reuse the AIO io_submit() mechanism.
What's the best way here? I hate to duplicate all the code VFS is doing.
I hate do duplicate all the code VFS is doing.


Other than that the code seems quite nice and simple, but one huge
problem is that it'll only support raw images, and thus misses out
on all the nice image formats used in qemu deployments, especially
qcow2.  It's also missing the ioctl magic we're having in various
places, both for controlling host devices like cdroms and SG
passthrough.
  
True... unfortunately, I don't understand all of those (qcow2) details 
yet !! I need to read up on those,

to even make a comment :(

Thanks,
Badari




Re: [RFC] vhost-blk implementation

2010-03-24 Thread Badari Pulavarty

Christoph Hellwig wrote:
Inspired by vhost-net implementation, I did initial prototype 
of vhost-blk to see if it provides any benefits over QEMU virtio-blk.

I haven't handled all the error cases, fixed naming conventions etc.,
but the implementation is stable to play with. I tried not to deviate
from vhost-net implementation where possible.



Can you also send the qemu side of it?
  
It's pretty hacky and based on an old (vhost-net) patch from MST for
simplicity. I haven't focused on cleaning it up, and I will re-base it on
MST's latest code once it gets into QEMU.

Thanks,
Badari

---
hw/virtio-blk.c |  199 
1 file changed, 199 insertions(+)

Index: vhost/hw/virtio-blk.c
===
--- vhost.orig/hw/virtio-blk.c  2010-02-25 16:47:04.0 -0500
+++ vhost/hw/virtio-blk.c   2010-03-17 14:07:26.477430740 -0400
@@ -18,6 +18,7 @@
 #ifdef __linux__
 # include <scsi/sg.h>
 #endif
+#include "kvm.h"
 
 typedef struct VirtIOBlock
 {
@@ -28,8 +29,13 @@
     char serial_str[BLOCK_SERIAL_STRLEN + 1];
     QEMUBH *bh;
     size_t config_size;
+    uint8_t vhost_started;
 } VirtIOBlock;
 
+typedef struct BDRVRawState {
+    int fd;
+} BDRVRawState;
+
 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 {
     return (VirtIOBlock *)vdev;
@@ -501,6 +507,198 @@
     return 0;
 }
 
+#if 1
+#include <linux/vhost.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include "vhost.h"
+
+int vhost_blk_fd;
+
+struct slot_info {
+    unsigned long phys_addr;
+    unsigned long len;
+    unsigned long userspace_addr;
+    unsigned flags;
+    int logging_count;
+};
+
+extern struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS];
+
+static int vhost_blk_start(struct VirtIODevice *vdev)
+{
+    target_phys_addr_t s, l, a;
+    int r, num, idx = 0;
+    struct vhost_vring_state state;
+    struct vhost_vring_file file;
+    struct vhost_vring_addr addr;
+    unsigned long long used_phys;
+    void *desc, *avail, *used;
+    int i, n = 0;
+    struct VirtQueue *q = virtio_queue(vdev, idx);
+    VirtIOBlock *vb = to_virtio_blk(vdev);
+    struct vhost_memory *mem;
+    BDRVRawState *st = vb->bs->opaque;
+
+    vhost_blk_fd = open("/dev/vhost-blk", O_RDWR);
+    if (vhost_blk_fd < 0) {
+        fprintf(stderr, "unable to open vhost-blk\n");
+        return -errno;
+    }
+
+    r = ioctl(vhost_blk_fd, VHOST_SET_OWNER, NULL);
+    if (r < 0) {
+        fprintf(stderr, "ioctl VHOST_SET_OWNER failed\n");
+        return -errno;
+    }
+
+    for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
+        if (!slots[i].len ||
+            (slots[i].flags & KVM_MEM_LOG_DIRTY_PAGES)) {
+            continue;
+        }
+        ++n;
+    }
+
+    mem = qemu_mallocz(offsetof(struct vhost_memory, regions) +
+                       n * sizeof(struct vhost_memory_region));
+    if (!mem)
+        return -ENOMEM;
+
+    mem->nregions = n;
+    n = 0;
+    for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
+        if (!slots[i].len || (slots[i].flags &
+            KVM_MEM_LOG_DIRTY_PAGES)) {
+            continue;
+        }
+        mem->regions[n].guest_phys_addr = slots[i].phys_addr;
+        mem->regions[n].memory_size = slots[i].len;
+        mem->regions[n].userspace_addr = slots[i].userspace_addr;
+        ++n;
+    }
+
+    r = ioctl(vhost_blk_fd, VHOST_SET_MEM_TABLE, mem);
+    if (r < 0)
+        return -errno;
+
+    state.index = idx;
+    num = state.num = virtio_queue_get_num(vdev, idx);
+    r = ioctl(vhost_blk_fd, VHOST_SET_VRING_NUM, &state);
+    if (r) {
+        fprintf(stderr, "ioctl VHOST_SET_VRING_NUM failed\n");
+        return -errno;
+    }
+
+    state.num = virtio_queue_last_avail_idx(vdev, idx);
+    r = ioctl(vhost_blk_fd, VHOST_SET_VRING_BASE, &state);
+    if (r) {
+        fprintf(stderr, "ioctl VHOST_SET_VRING_BASE failed\n");
+        return -errno;
+    }
+
+    s = l = sizeof(struct vring_desc) * num;
+    a = virtio_queue_get_desc(vdev, idx);
+    desc = cpu_physical_memory_map(a, &l, 0);
+    if (!desc || l != s) {
+        r = -ENOMEM;
+        goto fail_alloc;
+    }
+    s = l = offsetof(struct vring_avail, ring) +
+        sizeof(u_int64_t) * num;
+    a = virtio_queue_get_avail(vdev, idx);
+    avail = cpu_physical_memory_map(a, &l, 0);
+    if (!avail || l != s) {
+        r = -ENOMEM;
+        goto fail_alloc;
+    }
+    s = l = offsetof(struct vring_used, ring) +
+        sizeof(struct vring_used_elem) * num;
+    used_phys = a = virtio_queue_get_used(vdev, idx);
+    used =

Re: [RFC] vhost-blk implementation

2010-03-23 Thread Avi Kivity

On 03/23/2010 03:00 AM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier
   


[RFC] vhost-blk implementation.eml

Subject:
[RFC] vhost-blk implementation
From:
Badari Pulavarty pbad...@us.ibm.com
Date:
Mon, 22 Mar 2010 17:34:06 -0700

To:
virtualizat...@lists.linux-foundation.org, qemu-de...@nongnu.org


Hi,

Inspired by vhost-net implementation, I did initial prototype
of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
I haven't handled all the error cases, fixed naming conventions etc.,
but the implementation is stable to play with. I tried not to deviate
from vhost-net implementation where possible.

NOTE:  Only change I had to make to vhost core code is to
increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h

Performance:
=

I have done simple tests to see how it performs. I got very
encouraging results on sequential read tests. But on sequential
write tests, I see degrade over virtio-blk. I can't figure out and
explain why. Can some one shed light on whats happening here ?

Read Results:
=
Test does read of 84GB file from the host (through virtio). I unmount
and mount the filesystem on the host to make sure there is nothing
in the page cache..

+#define VHOST_BLK_VQ_MAX 1
+
+struct vhost_blk {
+   struct vhost_dev dev;
+   struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+   struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
+			struct iovec *iov, int in)
+{
+	loff_t pos = sector << 8;
+	int ret = 0;
+
+	if (type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(file, file->f_path.dentry, 1);
+	} else if (type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(file, iov, in, &pos);
+	} else {
+		ret = vfs_readv(file, iov, in, &pos);
+	}
+	return ret;
+}
   


This should be done asynchronously.  That is likely the cause of the write 
performance degradation.  For reads, readahead means that you're 
async anyway, but writes/syncs are still synchronous.


I also think it should be done at the bio layer.  File I/O is going to 
be slower, if we do vhost-blk we should concentrate on maximum 
performance.  The block layer also exposes more functionality we can use 
(asynchronous barriers for example).


btw, for fairness, cpu measurements should be done from the host side 
and include the vhost thread.


--
error compiling committee.c: too many arguments to function



Re: [RFC] vhost-blk implementation

2010-03-23 Thread Avi Kivity

On 03/23/2010 04:50 AM, Badari Pulavarty wrote:

Anthony Liguori wrote:

On 03/22/2010 08:45 PM, Badari Pulavarty wrote:

Anthony Liguori wrote:

On 03/22/2010 08:00 PM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier



These virtio results are still with a 2.6.18 kernel with no aio, 
right?

Results are on 2.6.33-rc8-net-next kernel. But not using AIO.


Can you try with aio and cache=none?


aio-native,cache=none

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
64+0 records in
64+0 records out
8388608 bytes (84 GB) copied, 470.588 seconds, 170 MB/s



You're effectively killing readahead with this.  Please use fio with a 
sequential pattern, queue depth > 1.
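
For example (illustrative values):

  [seqread]
  filename=/dev/vda
  rw=read
  bs=128k
  direct=1
  ioengine=libaio
  iodepth=8          ; keep more than one request in flight
  runtime=60
  time_based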


--
error compiling committee.c: too many arguments to function



Re: [RFC] vhost-blk implementation

2010-03-23 Thread Eran Rom
 with vhost-blk:
 
 
 # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
 64+0 records in
 64+0 records out
 8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s
 
 # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct
 
Can you please check both read  write with cache=off, bs=128k 
Thanks very much,
Eran 



Re: [RFC] vhost-blk implementation

2010-03-23 Thread Badari Pulavarty

Avi Kivity wrote:

On 03/23/2010 04:50 AM, Badari Pulavarty wrote:

Anthony Liguori wrote:

On 03/22/2010 08:45 PM, Badari Pulavarty wrote:

Anthony Liguori wrote:

On 03/22/2010 08:00 PM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier



These virtio results are still with a 2.6.18 kernel with no aio, 
right?

Results are on 2.6.33-rc8-net-next kernel. But not using AIO.


Can you try with aio and cache=none?


aio-native,cache=none

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
64+0 records in
64+0 records out
8388608 bytes (84 GB) copied, 470.588 seconds, 170 MB/s



You're effectively killing readahead with this.  


Yes. cache=none, aio=native will force it to hit the disk all the time. We
lose all the benefits of readahead.

Please use fio with a sequential pattern, queue depth > 1.


Will do.

Thanks,
Badari



Re: [RFC] vhost-blk implementation

2010-03-23 Thread Badari Pulavarty

Avi Kivity wrote:

On 03/23/2010 03:00 AM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier
  
[RFC] vhost-blk implementation.eml


Subject:
[RFC] vhost-blk implementation
From:
Badari Pulavarty pbad...@us.ibm.com
Date:
Mon, 22 Mar 2010 17:34:06 -0700

To:
virtualizat...@lists.linux-foundation.org, qemu-de...@nongnu.org


Hi,

Inspired by vhost-net implementation, I did initial prototype
of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
I haven't handled all the error cases, fixed naming conventions etc.,
but the implementation is stable to play with. I tried not to deviate
from vhost-net implementation where possible.

NOTE:  Only change I had to make to vhost core code is to
increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h

Performance:
=

I have done simple tests to see how it performs. I got very
encouraging results on sequential read tests. But on sequential
write tests, I see degrade over virtio-blk. I can't figure out and
explain why. Can some one shed light on whats happening here ?

Read Results:
=
Test does read of 84GB file from the host (through virtio). I unmount
and mount the filesystem on the host to make sure there is nothing
in the page cache..

+#define VHOST_BLK_VQ_MAX 1
+
+struct vhost_blk {
+struct vhost_dev dev;
+struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
+			struct iovec *iov, int in)
+{
+	loff_t pos = sector << 8;
+	int ret = 0;
+
+	if (type & VIRTIO_BLK_T_FLUSH)  {
+		ret = vfs_fsync(file, file->f_path.dentry, 1);
+	} else if (type & VIRTIO_BLK_T_OUT) {
+		ret = vfs_writev(file, iov, in, &pos);
+	} else {
+		ret = vfs_readv(file, iov, in, &pos);
+	}
+	return ret;
+}
   


This should be done asynchronously.  That is likely the cause of write 
performance degradation.  For reads, readahead means that that you're 
async anyway, but writes/syncs are still synchronous.
I am not sure what you mean by async here. Even if I use
f_op->aio_write() it's still synchronous (except for DIO).  Since we are
writing to pagecache and not waiting for write() to complete, this is the
best we can do here.

Do you mean offloading write() handling to another thread?


I also think it should be done at the bio layer.  
I am not sure what you meant here. Do you want to do submit_bio()
directly? It's not going to be that simple. Since the sector # is an offset
within the file, one has to do getblocks() on it to find the real disk
block #s, and we have to do get_user_pages() on these iovecs before
submitting them to the bio layer. All of this work is done by
vfs_write()/vfs_read() anyway. I am not sure what you are suggesting here.

File I/O is going to be slower, if we do vhost-blk we should 
concentrate on maximum performance.  The block layer also exposes more 
functionality we can use (asynchronous barriers for example).




btw, for fairness, cpu measurements should be done from the host side 
and include the vhost thread.



Will do.

Thanks,
Badari



Re: [RFC] vhost-blk implementation

2010-03-23 Thread Avi Kivity

On 03/23/2010 04:55 PM, Badari Pulavarty wrote:
This should be done asynchronously.  That is likely the cause of 
write performance degradation.  For reads, readahead means that that 
you're async anyway, but writes/syncs are still synchronous.


I am not sure what you mean by async here. Even if I use 
f_op-aio_write() its still synchronous (except for DIO).  Since we 
are writing to pagecache and not waiting for write()

to complete, this is the best we can do here.


It fails with O_DIRECT or O_DSYNC, or if the write cache is full and we 
need to wait on writeout.  Reads that don't hit pagecache will also be 
synchronous.  You could use threads (like qemu) or use the block layer 
(which is async).




Do you mean offload write() handling to another thread ?


Yeah.  Read too.



I also think it should be done at the bio layer.
I am not sure what you meant here. Do you want to do submit_bio()
directly? It's not going to be that simple. Since the sector# is an
offset within the file, one has to do getblocks() on it to find the
real disk block numbers, plus we have to do get_user_pages() on these
iovecs before submitting them to the bio layer. All of this work is
done by vfs_write()/vfs_read() anyway, so I am not sure what you are
suggesting here.


For the case where the file is actually a partition, use submit_bio().
When the file is a file, keep it in qemu; that path is going to be
slower anyway.
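
For the raw-partition case, a rough sketch of going straight to the
block layer (using the bio API of this kernel generation) might look
like the following. It assumes the backing store is a whole block
device, that the guest pages were already pinned with get_user_pages(),
and that every segment is a full page-aligned page; vblk_bio_done() and
vblk_submit_bio() are hypothetical names, and error handling, bio
chaining and completion plumbing are omitted:

/*
 * Sketch only: submit guest I/O directly to the block layer when the
 * backing store is a raw block device, so no file-offset mapping is needed.
 */
static void vblk_bio_done(struct bio *bio, int error)
{
        /* ...report status to the virtqueue and release the pinned pages... */
        bio_put(bio);
}

static int vblk_submit_bio(struct block_device *bdev, int rw, sector_t sector,
                           struct page **pages, int npages)
{
        struct bio *bio = bio_alloc(GFP_KERNEL, npages);
        int i;

        if (!bio)
                return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio->bi_end_io = vblk_bio_done;

        for (i = 0; i < npages; i++)
                if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
                        break;  /* queue limit hit; a real version chains bios */

        submit_bio(rw, bio);    /* rw is READ or WRITE; completion is async */
        return 0;
}

The parts this sketch assumes away (mapping file offsets to disk blocks,
pinning the user pages) are exactly what makes the file-backed case
harder, which is why it only applies when the backing store is a
partition or whole disk.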



--
error compiling committee.c: too many arguments to function



[RFC] vhost-blk implementation

2010-03-22 Thread Badari Pulavarty
Forgot to CC: KVM list earlier
---BeginMessage---
Hi,

Inspired by the vhost-net implementation, I did an initial prototype
of vhost-blk to see if it provides any benefits over QEMU virtio-blk.
I haven't handled all the error cases or fixed naming conventions etc.,
but the implementation is stable enough to play with. I tried not to
deviate from the vhost-net implementation where possible.

NOTE: The only change I had to make to the vhost core code was to
increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h.
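
In vhost.h that change amounts to something like the following (the
comment is mine; the +2 presumably covers the virtio-blk request header
and status descriptors):

/*
 * 128 data segments plus the virtio-blk request header and status
 * descriptors, so a full request fits in one descriptor chain.
 */
#define VHOST_NET_MAX_SG 130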

Performance:
=

I have done simple tests to see how it performs. I got very
encouraging results on sequential read tests. But on sequential
write tests, I see a degradation compared to virtio-blk. I can't figure
out or explain why. Can someone shed light on what's happening here?

Read Results:
=
The test reads an 84GB file from the host (through virtio). I unmount
and re-mount the filesystem on the host to make sure there is nothing
in the page cache.


with vhost-blk:


# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
64+0 records in
64+0 records out
8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s

real    2m6.137s
user    0m0.281s
sys     0m14.725s

without vhost-blk: (virtio)
---

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
64+0 records in
64+0 records out
8388608 bytes (84 GB) copied, 275.466 seconds, 305 MB/s

real    4m35.468s
user    0m0.373s
sys     0m48.074s



Write Results:
==

I see degraded IO performance when doing sequential IO write
tests with vhost-blk compared to virtio-blk.

# time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct

I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with
vhost-blk. Wondering why ?

Comments/flames ? 

Thanks,
Badari


vhost-blk is an in-kernel accelerator for virtio-blk.
At this time, this is a prototype based on virtio-net.
Lots of error handling and cleanup still need to be done.
Read performance is pretty good compared to QEMU virtio-blk, but
write performance is not anywhere close to QEMU virtio-blk.
Why?

Signed-off-by: Badari Pulavarty pbad...@us.ibm.com
---
 drivers/vhost/blk.c |  242 
 1 file changed, 242 insertions(+)

Index: net-next/drivers/vhost/blk.c
===
--- /dev/null   1970-01-01 00:00:00.0 +
+++ net-next/drivers/vhost/blk.c2010-03-22 18:07:18.156584400 -0400
@@ -0,0 +1,242 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamelessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+struct vhost_blk {
+   struct vhost_dev dev;
+   struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+   struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
+                        struct iovec *iov, int in)
+{
+        loff_t pos = sector << 8;
+        int ret = 0;
+
+        if (type & VIRTIO_BLK_T_FLUSH) {
+                ret = vfs_fsync(file, file->f_path.dentry, 1);
+        } else if (type & VIRTIO_BLK_T_OUT) {
+                ret = vfs_writev(file, iov, in, &pos);
+        } else {
+                ret = vfs_readv(file, iov, in, &pos);
+        }
+        return ret;
+}
+
+static void handle_blk(struct vhost_blk *blk)
+{
+        struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+        unsigned head, out, in;
+        struct virtio_blk_outhdr hdr;
+        int r, nvecs;
+        uint8_t status = 0;
+
+        use_mm(blk->dev.mm);
+        mutex_lock(&vq->mutex);
+
+        vhost_disable_notify(vq);
+
+        for (;;) {
+                head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+                                         ARRAY_SIZE(vq->iov),
+                                         &out, &in, NULL, NULL);
+                if (head == vq->num) {
+                        if (unlikely(vhost_enable_notify(vq))) {
+                                vhost_disable_notify(vq);
+                                continue;
+                        }
+                        break;
+                }
+
+                BUG_ON(vq->iov[0].iov_len != 16);
+
+                r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr);
+                if (r < 0) {
+                        printk("copy from user failed\n");
+                        vhost_discard_vq_desc(vq);
+                        break;
+                }
+
+                nvecs = out - 1;
+                if (hdr.type == VIRTIO_BLK_T_IN)
+                        nvecs = in - 1;
+
+                r = do_handle_io(vq->private_data, hdr.type, hdr.sector,
+                                 &vq->iov[1], nvecs);

Re: [RFC] vhost-blk implementation

2010-03-22 Thread Anthony Liguori

On 03/22/2010 08:00 PM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier
   



These virtio results are still with a 2.6.18 kernel with no aio, right?

Regards,

Anthony Liguori


Re: [RFC] vhost-blk implementation

2010-03-22 Thread Badari Pulavarty

Anthony Liguori wrote:

On 03/22/2010 08:00 PM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier
   



These virtio results are still with a 2.6.18 kernel with no aio, right?

Results are on 2.6.33-rc8-net-next kernel. But not using AIO.

Thanks,
Badari




Re: [RFC] vhost-blk implementation

2010-03-22 Thread Anthony Liguori

On 03/22/2010 08:45 PM, Badari Pulavarty wrote:

Anthony Liguori wrote:

On 03/22/2010 08:00 PM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier



These virtio results are still with a 2.6.18 kernel with no aio, right?

Results are on 2.6.33-rc8-net-next kernel. But not using AIO.


Can you try with aio and cache=none?

Regards,

Anthony Liguori


Thanks,
Badari






Re: [RFC] vhost-blk implementation

2010-03-22 Thread Badari Pulavarty

Anthony Liguori wrote:

On 03/22/2010 08:45 PM, Badari Pulavarty wrote:

Anthony Liguori wrote:

On 03/22/2010 08:00 PM, Badari Pulavarty wrote:

Forgot to CC: KVM list earlier



These virtio results are still with a 2.6.18 kernel with no aio, right?

Results are on 2.6.33-rc8-net-next kernel. But not using AIO.


Can you try with aio and cache=none?


aio=native,cache=none

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
64+0 records in
64+0 records out
8388608 bytes (84 GB) copied, 470.588 seconds, 170 MB/s

Thanks,
Badari



