Re: [RFC] vhost-blk implementation
On Fri, Mar 26, 2010 at 6:53 PM, Eran Rom er...@il.ibm.com wrote: Christoph Hellwig hch at infradead.org writes: Ok. cache=writeback performance is something I haven't bothered looking at at all. For cache=none any streaming write or random workload with large enough record sizes got basically the same performance as native using kernel aio, and same for write but slightly degraded for reads using the thread pool. See my attached JLS presentation for some numbers. Looks like the presentation did not make it... I am interested in the JLS presentation too. Here is what I found, hope it's the one you meant, Christoph: http://events.linuxfoundation.org/images/stories/slides/jls09/jls09_hellwig.odp Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC] vhost-blk implementation (v2)
Hi All, Here is the latest version of vhost-blk implementation. Major difference from my previous implementation is that, I now merge all contiguous requests (both read and write), before submitting them. This significantly improved IO performance. I am still collecting performance numbers, I will be posting in next few days. Comments ? Todo: - Address hch's comments on annontations - Implement per device read/write queues - Finish up error handling Thanks, Badari --- drivers/vhost/blk.c | 445 1 file changed, 445 insertions(+) Index: net-next/drivers/vhost/blk.c === --- /dev/null 1970-01-01 00:00:00.0 + +++ net-next/drivers/vhost/blk.c2010-04-06 16:38:03.563847905 -0400 @@ -0,0 +1,445 @@ + /* + * virtio-block server in host kernel. + * Inspired by vhost-net and shamlessly ripped code from it :) + */ + +#include linux/compat.h +#include linux/eventfd.h +#include linux/vhost.h +#include linux/virtio_net.h +#include linux/virtio_blk.h +#include linux/mmu_context.h +#include linux/miscdevice.h +#include linux/module.h +#include linux/mutex.h +#include linux/workqueue.h +#include linux/rcupdate.h +#include linux/file.h + +#include vhost.h + +#define VHOST_BLK_VQ_MAX 1 +#define SECTOR_SHIFT 9 + +struct vhost_blk { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; + struct vhost_poll poll[VHOST_BLK_VQ_MAX]; +}; + +struct vhost_blk_io { + struct list_head list; + struct work_struct work; + struct vhost_blk *blk; + struct file *file; + int head; + uint32_t type; + uint32_t nvecs; + uint64_t sector; + uint64_t len; + struct iovec iov[0]; +}; + +static struct workqueue_struct *vblk_workqueue; +static LIST_HEAD(write_queue); +static LIST_HEAD(read_queue); + +static void handle_io_work(struct work_struct *work) +{ + struct vhost_blk_io *vbio, *entry; + struct vhost_virtqueue *vq; + struct vhost_blk *blk; + struct list_head single, *head, *node, *tmp; + + int i, need_free, ret = 0; + loff_t pos; + uint8_t status = 0; + + vbio = container_of(work, struct 
vhost_blk_io, work); + blk = vbio-blk; + vq = blk-dev.vqs[0]; + pos = vbio-sector 8; + + use_mm(blk-dev.mm); + if (vbio-type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(vbio-file, vbio-file-f_path.dentry, 1); + } else if (vbio-type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(vbio-file, vbio-iov, vbio-nvecs, pos); + } else { + ret = vfs_readv(vbio-file, vbio-iov, vbio-nvecs, pos); + } + status = (ret 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + if (vbio-head != -1) { + INIT_LIST_HEAD(single); + list_add(vbio-list, single); + head = single; + need_free = 0; + } else { + head = vbio-list; + need_free = 1; + } + list_for_each_entry(entry, head, list) { + copy_to_user(entry-iov[entry-nvecs].iov_base, status, sizeof status); + } + mutex_lock(vq-mutex); + list_for_each_safe(node, tmp, head) { + entry = list_entry(node, struct vhost_blk_io, list); + vhost_add_used_and_signal(blk-dev, vq, entry-head, ret); + list_del(node); + kfree(entry); + } + mutex_unlock(vq-mutex); + unuse_mm(blk-dev.mm); + if (need_free) + kfree(vbio); +} + +static struct vhost_blk_io *allocate_vbio(int nvecs) +{ + struct vhost_blk_io *vbio; + int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec); + vbio = kmalloc(size, GFP_KERNEL); + if (vbio) { + INIT_WORK(vbio-work, handle_io_work); + INIT_LIST_HEAD(vbio-list); + } + return vbio; +} + +static void merge_and_handoff_work(struct list_head *queue) +{ + struct vhost_blk_io *vbio, *entry; + int nvecs = 0; + int entries = 0; + + list_for_each_entry(entry, queue, list) { + nvecs += entry-nvecs; + entries++; + } + + if (entries == 1) { + vbio = list_first_entry(queue, struct vhost_blk_io, list); + list_del(vbio-list); + queue_work(vblk_workqueue, vbio-work); + return; + } + + vbio = allocate_vbio(nvecs); + if (!vbio) { + /* Unable to allocate memory - submit IOs individually */ + list_for_each_entry(vbio, queue, list) { + queue_work(vblk_workqueue, vbio-work); + } + INIT_LIST_HEAD(queue); + return; + } + + entry = list_first_entry(queue, struct 
vhost_blk_io, list); + vbio-nvecs = nvecs; + vbio-blk = entry-blk; +
Re: [RFC] vhost-blk implementation
On Mon, Mar 29, 2010 at 4:41 PM, Badari Pulavarty pbad...@us.ibm.com wrote: +static void handle_io_work(struct work_struct *work) +{ + struct vhost_blk_io *vbio; + struct vhost_virtqueue *vq; + struct vhost_blk *blk; + int i, ret = 0; + loff_t pos; + uint8_t status = 0; + + vbio = container_of(work, struct vhost_blk_io, work); + blk = vbio-blk; + vq = blk-dev.vqs[0]; + pos = vbio-sector 8; + + use_mm(blk-dev.mm); + + if (vbio-type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(vbio-file, vbio-file-f_path.dentry, 1); + } else if (vbio-type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(vbio-file, vbio-iov, vbio-nvecs, pos); + } else { + ret = vfs_readv(vbio-file, vbio-iov, vbio-nvecs, pos); + } + + status = (ret 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + if (copy_to_user(vbio-iov[vbio-nvecs].iov_base, status, sizeof status) 0) { + printk(copy to user failed\n); + vhost_discard_vq_desc(vq); + unuse_mm(blk-dev.mm); + return; Do you need to kfree(vbio) here? +static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd) +{ + struct file *file; + struct vhost_virtqueue *vq; + + file = fget(fd); + if (!file) + return -EBADF; + + vq = n-vqs + index; + mutex_lock(vq-mutex); + rcu_assign_pointer(vq-private_data, file); + mutex_unlock(vq-mutex); + return 0; +} + + +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_blk *n = f-private_data; + void __user *argp = (void __user *)arg; + struct vhost_vring_file backend; + int r; + + switch (ioctl) { + case VHOST_NET_SET_BACKEND: + r = copy_from_user(backend, argp, sizeof backend); + if (r 0) + return r; + return vhost_blk_set_backend(n, backend.index, backend.fd); I don't see backend.index being checked against VHOST_BLK_VQ_MAX. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote: iovecs and buffers are user-space pointers (from the host kernel point of view). They are guest address. So, I don't need to do any set_fs tricks. From verifying the code and using the sparse annotations it appears that the actual buffers are userspace pointers, but the iovecs in the virtqueue are kernel level pointers, so you would need some annotations. While we're at it here is a patch fixing the remaining sparse warnings in vhost-blk: Signed-off-by: Christoph Hellwig h...@lst.de Index: linux-2.6/drivers/vhost/blk.c === --- linux-2.6.orig/drivers/vhost/blk.c 2010-04-05 21:15:11.638004250 +0200 +++ linux-2.6/drivers/vhost/blk.c 2010-04-05 21:16:13.238004599 +0200 @@ -86,7 +86,7 @@ static void handle_blk(struct vhost_blk nvecs++; BUG_ON(vq-iov[nvecs].iov_len != 1); - if (copy_to_user(vq-iov[nvecs].iov_base, status, sizeof status) 0) { + if (copy_to_user(vq-iov[nvecs].iov_base, status, sizeof status)) { printk(copy to user failed\n); vhost_discard_vq_desc(vq); break; @@ -199,7 +199,7 @@ static struct miscdevice vhost_blk_misc vhost_blk_fops, }; -int vhost_blk_init(void) +static int vhost_blk_init(void) { int r = vhost_init(); if (r) @@ -216,7 +216,7 @@ err_init: } module_init(vhost_blk_init); -void vhost_blk_exit(void) +static void vhost_blk_exit(void) { misc_deregister(vhost_blk_misc); vhost_cleanup(); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Thu, Mar 25, 2010 at 04:00:56PM +0100, Asdo wrote: Would the loop device provide the features of a block device? I recall barrier support at least has been added recently. It does, but not in a very efficient way. Is it recommended to run kvm on a loopback mounted file compared to on a raw file? No. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Mon, 2010-04-05 at 15:23 -0400, Christoph Hellwig wrote: On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote: iovecs and buffers are user-space pointers (from the host kernel point of view). They are guest address. So, I don't need to do any set_fs tricks. From verifying the code and using the sparse annotations it appears that the actual buffers are userspace pointers, but the iovecs in the virtqueue are kernel level pointers, so you would need some annotations. Yes. Thats correct. I will add appropriate annotations. While we're at it here is a patch fixing the remaining sparse warnings in vhost-blk: Applied. Thanks, Badari Signed-off-by: Christoph Hellwig h...@lst.de Index: linux-2.6/drivers/vhost/blk.c === --- linux-2.6.orig/drivers/vhost/blk.c2010-04-05 21:15:11.638004250 +0200 +++ linux-2.6/drivers/vhost/blk.c 2010-04-05 21:16:13.238004599 +0200 @@ -86,7 +86,7 @@ static void handle_blk(struct vhost_blk nvecs++; BUG_ON(vq-iov[nvecs].iov_len != 1); - if (copy_to_user(vq-iov[nvecs].iov_base, status, sizeof status) 0) { + if (copy_to_user(vq-iov[nvecs].iov_base, status, sizeof status)) { printk(copy to user failed\n); vhost_discard_vq_desc(vq); break; @@ -199,7 +199,7 @@ static struct miscdevice vhost_blk_misc vhost_blk_fops, }; -int vhost_blk_init(void) +static int vhost_blk_init(void) { int r = vhost_init(); if (r) @@ -216,7 +216,7 @@ err_init: } module_init(vhost_blk_init); -void vhost_blk_exit(void) +static void vhost_blk_exit(void) { misc_deregister(vhost_blk_misc); vhost_cleanup(); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Mon, 2010-04-05 at 15:22 +0100, Stefan Hajnoczi wrote: On Mon, Mar 29, 2010 at 4:41 PM, Badari Pulavarty pbad...@us.ibm.com wrote: +static void handle_io_work(struct work_struct *work) +{ + struct vhost_blk_io *vbio; + struct vhost_virtqueue *vq; + struct vhost_blk *blk; + int i, ret = 0; + loff_t pos; + uint8_t status = 0; + + vbio = container_of(work, struct vhost_blk_io, work); + blk = vbio-blk; + vq = blk-dev.vqs[0]; + pos = vbio-sector 8; + + use_mm(blk-dev.mm); + + if (vbio-type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(vbio-file, vbio-file-f_path.dentry, 1); + } else if (vbio-type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(vbio-file, vbio-iov, vbio-nvecs, pos); + } else { + ret = vfs_readv(vbio-file, vbio-iov, vbio-nvecs, pos); + } + + status = (ret 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + if (copy_to_user(vbio-iov[vbio-nvecs].iov_base, status, sizeof status) 0) { + printk(copy to user failed\n); + vhost_discard_vq_desc(vq); + unuse_mm(blk-dev.mm); + return; Do you need to kfree(vbio) here? Yes. I do. As mentioned earlier, I haven't fixed error handling yet :( +static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd) +{ + struct file *file; + struct vhost_virtqueue *vq; + + file = fget(fd); + if (!file) + return -EBADF; + + vq = n-vqs + index; + mutex_lock(vq-mutex); + rcu_assign_pointer(vq-private_data, file); + mutex_unlock(vq-mutex); + return 0; +} + + +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl, +unsigned long arg) +{ + struct vhost_blk *n = f-private_data; + void __user *argp = (void __user *)arg; + struct vhost_vring_file backend; + int r; + + switch (ioctl) { +case VHOST_NET_SET_BACKEND: + r = copy_from_user(backend, argp, sizeof backend); + if (r 0) + return r; + return vhost_blk_set_backend(n, backend.index, backend.fd); I don't see backend.index being checked against VHOST_BLK_VQ_MAX. Yep. You are right. I will add these checks for my next revision. 
Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/30/2010 01:51 AM, Badari Pulavarty wrote: Your io wait time is twice as long and your throughput is about half. I think the qemu block submission does an extra attempt at merging requests. Does blktrace tell you anything interesting? Yes. I see that in my testcase (2M writes) - QEMU is picking up 512K requests from the virtio ring and merging them back to 2M before submitting them. Unfortunately, I can't do that quite easily in vhost-blk. QEMU re-creates iovecs for the merged IO. I have to come up with a scheme to do this :( I don't think that either vhost-blk or virtio-blk should do this. Merging increases latency, and in the case of streaming writes makes it impossible for the guest to prepare new requests while earlier ones are being serviced (in effect it reduces the queue depth to 1). qcow2 does benefit from merging, but it should do so itself without impacting raw. It does. I suggest using fio O_DIRECT random access patterns to avoid such issues. Well, I am not trying to come up with a test case where vhost-blk performs better than virtio-blk. I am trying to understand where and why vhost-blk performs worse than virtio-blk. In this case qemu-virtio is making an incorrect tradeoff. The guest could easily merge those requests itself. If you want larger writes, tune the guest to issue them. Another way to look at it: merging improved bandwidth but increased latency, yet you are only measuring bandwidth. If you measured only latency you'd find that vhost-blk is better. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Hi Christoph, I am wondering if you can provide your thoughts here.. I modified my vhost-blk implementation to offload work to work_queues instead of doing synchronously. Infact, I tried to spread the work across all the CPUs. But to my surprise, this did not improve the performance compared to virtio-blk. I see vhost-blk taking more interrupts and context switches compared to virtio-blk. What is virtio-blk doing which I am not able to from vhost-blk ??? Thanks, Badari vhost-blk procs ---memory-- ---swap-- -io --system-- -cpu- r b swpd free buff cache si sobibo in cs us sy id wa st 3 1 8920 56076 20760 56035560 104 196 79826 17164 13912 0 5 65 30 0 2 4 9488 57216 20744 56056160 114 195 81120 17397 13824 0 5 65 30 0 2 2 10028 68476 20728 55947640 108 206 80318 17162 13845 0 5 65 30 0 0 4 10560 70856 20708 55930880 106 205 82363 17402 13904 0 5 65 30 0 1 3 10948 80380 20672 55844520 78 178 79714 17113 13875 0 5 66 29 0 qemu virtio-blk: procs ---memory-- ---swap-- -io --system-- -cpu- r b swpd free buff cache si sobibo in cs us sy id wa st 0 1 14124 57456 5144 492406000 139 142546 11287 9312 1 4 80 15 0 0 2 14124 56736 5148 492739600 146 142968 11283 9248 1 4 80 15 0 0 1 14124 56712 5384 49270200074 150738 11182 9327 1 4 80 16 0 1 1 14124 55496 5392 492790400 2 159902 11172 9401 1 3 79 17 0 0 1 14124 55968 5408 492723200 0 159202 11212 9325 1 3 80 16 0 --- drivers/vhost/blk.c | 310 1 file changed, 310 insertions(+) Index: net-next/drivers/vhost/blk.c === --- /dev/null 1970-01-01 00:00:00.0 + +++ net-next/drivers/vhost/blk.c2010-03-25 20:06:57.484054770 -0400 @@ -0,0 +1,310 @@ + /* + * virtio-block server in host kernel. 
+ * Inspired by vhost-net and shamlessly ripped code from it :) + */ + +#include linux/compat.h +#include linux/eventfd.h +#include linux/vhost.h +#include linux/virtio_net.h +#include linux/virtio_blk.h +#include linux/mmu_context.h +#include linux/miscdevice.h +#include linux/module.h +#include linux/mutex.h +#include linux/workqueue.h +#include linux/rcupdate.h +#include linux/file.h + +#include vhost.h + +#define VHOST_BLK_VQ_MAX 1 + +#if 0 +#define myprintk(fmt, ...) printk(pr_fmt(fmt), ##__VA_ARGS__) +#else +#define myprintk(fmt, ...) +#endif + +struct vhost_blk { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; + struct vhost_poll poll[VHOST_BLK_VQ_MAX]; +}; + +struct vhost_blk_io { + struct work_struct work; + struct vhost_blk *blk; + struct file *file; + int head; + uint32_t type; + uint64_t sector; + struct iovec *iov; + int nvecs; +}; + +static struct workqueue_struct *vblk_workqueue; + +static void handle_io_work(struct work_struct *work) +{ + struct vhost_blk_io *vbio; + struct vhost_virtqueue *vq; + struct vhost_blk *blk; + int i, ret = 0; + loff_t pos; + uint8_t status = 0; + + vbio = container_of(work, struct vhost_blk_io, work); + blk = vbio-blk; + vq = blk-dev.vqs[0]; + pos = vbio-sector 8; + + use_mm(blk-dev.mm); + + if (vbio-type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(vbio-file, vbio-file-f_path.dentry, 1); + } else if (vbio-type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(vbio-file, vbio-iov, vbio-nvecs, pos); + } else { + ret = vfs_readv(vbio-file, vbio-iov, vbio-nvecs, pos); + } + + status = (ret 0) ? 
VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + if (copy_to_user(vbio-iov[vbio-nvecs].iov_base, status, sizeof status) 0) { + printk(copy to user failed\n); + vhost_discard_vq_desc(vq); + unuse_mm(blk-dev.mm); + return; + } + mutex_lock(vq-mutex); + vhost_add_used_and_signal(blk-dev, vq, vbio-head, ret); + mutex_unlock(vq-mutex); + unuse_mm(blk-dev.mm); + kfree(vbio); +} + +static int cpu = 0; +static int handoff_io(struct vhost_blk *blk, int head, + uint32_t type, uint64_t sector, + struct iovec *iov, int nvecs) +{ + struct vhost_virtqueue *vq = blk-dev.vqs[0]; + struct vhost_blk_io *vbio; + + vbio = kmalloc(sizeof(struct vhost_blk_io), GFP_KERNEL); + if (!vbio) + return -ENOMEM; + + INIT_WORK(vbio-work, handle_io_work); + vbio-blk = blk; + vbio-file = vq-private_data; + vbio-head = head; + vbio-type = type; + vbio-sector = sector; + vbio-iov = iov; + vbio-nvecs = nvecs; + +
Re: [RFC] vhost-blk implementation
* Badari Pulavarty (pbad...@us.ibm.com) wrote: I modified my vhost-blk implementation to offload work to work_queues instead of doing synchronously. Infact, I tried to spread the work across all the CPUs. But to my surprise, this did not improve the performance compared to virtio-blk. I see vhost-blk taking more interrupts and context switches compared to virtio-blk. What is virtio-blk doing which I am not able to from vhost-blk ??? Your io wait time is twice as long and your throughput is about half. I think the qmeu block submission does an extra attempt at merging requests. Does blktrace tell you anything interesting? procs ---memory-- ---swap-- -io --system-- -cpu- r b swpd free buff cache si sobibo in cs us sy id wa st 3 1 8920 56076 20760 56035560 104 196 79826 17164 13912 0 5 65 30 0 2 4 9488 57216 20744 56056160 114 195 81120 17397 13824 0 5 65 30 0 2 2 10028 68476 20728 55947640 108 206 80318 17162 13845 0 5 65 30 0 0 4 10560 70856 20708 55930880 106 205 82363 17402 13904 0 5 65 30 0 1 3 10948 80380 20672 55844520 78 178 79714 17113 13875 0 5 66 29 0 qemu virtio-blk: procs ---memory-- ---swap-- -io --system-- -cpu- r b swpd free buff cache si sobibo in cs us sy id wa st 0 1 14124 57456 5144 492406000 139 142546 11287 9312 1 4 80 15 0 0 2 14124 56736 5148 492739600 146 142968 11283 9248 1 4 80 15 0 0 1 14124 56712 5384 49270200074 150738 11182 9327 1 4 80 16 0 1 1 14124 55496 5392 492790400 2 159902 11172 9401 1 3 79 17 0 0 1 14124 55968 5408 492723200 0 159202 11212 9325 1 3 80 16 0 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/29/2010 09:20 PM, Chris Wright wrote: * Badari Pulavarty (pbad...@us.ibm.com) wrote: I modified my vhost-blk implementation to offload work to work_queues instead of doing synchronously. In fact, I tried to spread the work across all the CPUs. But to my surprise, this did not improve the performance compared to virtio-blk. I see vhost-blk taking more interrupts and context switches compared to virtio-blk. What is virtio-blk doing which I am not able to from vhost-blk ??? Your io wait time is twice as long and your throughput is about half. I think the qemu block submission does an extra attempt at merging requests. Does blktrace tell you anything interesting? It does. I suggest using fio O_DIRECT random access patterns to avoid such issues. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Mon, 2010-03-29 at 23:37 +0300, Avi Kivity wrote: On 03/29/2010 09:20 PM, Chris Wright wrote: * Badari Pulavarty (pbad...@us.ibm.com) wrote: I modified my vhost-blk implementation to offload work to work_queues instead of doing synchronously. In fact, I tried to spread the work across all the CPUs. But to my surprise, this did not improve the performance compared to virtio-blk. I see vhost-blk taking more interrupts and context switches compared to virtio-blk. What is virtio-blk doing which I am not able to from vhost-blk ??? Your io wait time is twice as long and your throughput is about half. I think the qemu block submission does an extra attempt at merging requests. Does blktrace tell you anything interesting? Yes. I see that in my testcase (2M writes) - QEMU is picking up 512K requests from the virtio ring and merging them back to 2M before submitting them. Unfortunately, I can't do that quite easily in vhost-blk. QEMU re-creates iovecs for the merged IO. I have to come up with a scheme to do this :( It does. I suggest using fio O_DIRECT random access patterns to avoid such issues. Well, I am not trying to come up with a test case where vhost-blk performs better than virtio-blk. I am trying to understand where and why vhost-blk performs worse than virtio-blk. Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
* Badari Pulavarty (pbad...@us.ibm.com) wrote: On Mon, 2010-03-29 at 23:37 +0300, Avi Kivity wrote: On 03/29/2010 09:20 PM, Chris Wright wrote: Your io wait time is twice as long and your throughput is about half. I think the qemu block submission does an extra attempt at merging requests. Does blktrace tell you anything interesting? Yes. I see that in my testcase (2M writes) - QEMU is picking up 512K requests from the virtio ring and merging them back to 2M before submitting them. Unfortunately, I can't do that quite easily in vhost-blk. QEMU re-creates iovecs for the merged IO. I have to come up with a scheme to do this :( Is close cooperator logic kicking in at all? Alternatively, using same io_context. It does. I suggest using fio O_DIRECT random access patterns to avoid such issues. Well, I am not trying to come up with a test case where vhost-blk performs better than virtio-blk. I am trying to understand where and why vhost-blk performs worse than virtio-blk. It would just level the playing field. Alternatively, commenting out merging in qemu to further validate it's the source, something as simple as this in block.c: -num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); +//num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); Although, from above, sounds like you've already verified this is the difference. IIRC, the sync write path is at odds w/ cfq elevator merging (so cache=none would have trouble, but you were doing cache=writeback, right?), but if you can hack your worker threads to share an io_context, like you had done CLONE_IO, you might get some merging back (again, just a hack to pinpoint merging as the culprit). thanks, -chris -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Christoph Hellwig hch at infradead.org writes: Ok. cache=writeback performance is something I haven't bothered looking at at all. For cache=none any streaming write or random workload with large enough record sizes got basically the same performance as native using kernel aio, and same for write but slightly degraded for reads using the thread pool. See my attached JLS presentation for some numbers. Looks like the presentation did not make it... Thanks, Eran -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/24/2010 10:05 PM, Christoph Hellwig wrote: On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote: I also think it should be done at the bio layer. File I/O is going to be slower, if we do vhost-blk we should concentrate on maximum performance. The block layer also exposes more functionality we can use (asynchronous barriers for example). The block layer is more flexible, but that limits you to only stack directly ontop of a block device, which is extremly inflexible. We still have a virtio implementation in userspace for file-based images. In any case, the file APIs are not asynchronous so we'll need a thread pool. That will probably minimize the difference in performance between the userspace and kernel implementations. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/24/2010 10:22 PM, Badari Pulavarty wrote: Which caching mode is this? I assume data=writeback, because otherwise you'd be doing synchronous I/O directly from the handler. Yes. This is with default (writeback) cache model. As mentioned earlier, readahead is helping here and in most cases, data would be ready in the pagecache. btw, relying on readahead is problematic. The guest already issues readahead requests, and the host will issue readahead based on that readahead, reading far into the future. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Avi Kivity wrote: On 03/24/2010 10:22 PM, Badari Pulavarty wrote: Which caching mode is this? I assume data=writeback, because otherwise you'd be doing synchronous I/O directly from the handler. Yes. This is with default (writeback) cache model. As mentioned earlier, readhead is helping here and most cases, data would be ready in the pagecache. btw, relying on readahead is problematic. The guest already issues readahead requests, and the host will issue readahead based on that readahead, reading far into the future. Sure. In my tests, I did O_DIRECT in the guest - so there is no readahead in the guest. Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Avi Kivity wrote: For the case where the file is actually a partition, use submit_bio(). When the file is a file, keep it in qemu, that path is going to be slower anyway. [CUT] Christoph Hellwig wrote: On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote: I also think it should be done at the bio layer. File I/O is going to be slower, if we do vhost-blk we should concentrate on maximum performance. The block layer also exposes more functionality we can use (asynchronous barriers for example). The block layer is more flexible, but that limits you to only stack directly ontop of a block device, which is extremly inflexible. Would the loop device provide the features of a block device? I recall barrier support at least has been added recently. Is it recommended to run kvm on a loopback mounted file compared to on a raw file? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Thu, Mar 25, 2010 at 08:29:03AM +0200, Avi Kivity wrote: We still have a virtio implementation in userspace for file-based images. In any case, the file APIs are not asynchronous so we'll need a thread pool. That will probably minimize the difference in performance between the userspace and kernel implementations. The kernel has real aio when doing direct I/O, although currently we can't easily use it from kernelspace. Thejre's a few people who've been looking into making it usable for that, though. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/25/2010 05:48 PM, Christoph Hellwig wrote: On Thu, Mar 25, 2010 at 08:29:03AM +0200, Avi Kivity wrote: We still have a virtio implementation in userspace for file-based images. In any case, the file APIs are not asynchronous so we'll need a thread pool. That will probably minimize the difference in performance between the userspace and kernel implementations. The kernel has real aio when doing direct I/O, although currently we can't easily use it from kernelspace. Thejre's a few people who've been looking into making it usable for that, though. Interesting. Are there limitations on supported filesystems, or allocating vs. non-allocating writes? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Wed, Mar 24, 2010 at 01:22:37PM -0700, Badari Pulavarty wrote: Yes. This is with default (writeback) cache model. As mentioned earlier, readahead is helping here and in most cases, data would be ready in the pagecache. Ok. cache=writeback performance is something I haven't bothered looking at at all. For cache=none any streaming write or random workload with large enough record sizes got basically the same performance as native using kernel aio, and same for write but slightly degraded for reads using the thread pool. See my attached JLS presentation for some numbers. iovecs and buffers are user-space pointers (from the host kernel point of view). They are guest addresses. So, I don't need to do any set_fs tricks. Right now they're not declared as such, so sparse would complain. Yes. QEMU virtio-blk is batching up all the writes and handing off the work to another thread. If using the thread pool. If using kernel aio it performs the io_submit system call directly. What should I do here? I can create a bunch of kernel threads to do the IO for me. Or somehow fit and reuse the AIO io_submit() mechanism. What's the best way here? I hate to duplicate all the code VFS is doing. The only thing you can do currently is adding a thread pool. It might be able to use io_submit for the O_DIRECT case with some refactoring. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Inspired by vhost-net implementation, I did initial prototype of vhost-blk to see if it provides any benefits over QEMU virtio-blk. I haven't handled all the error cases, fixed naming conventions etc., but the implementation is stable to play with. I tried not to deviate from vhost-net implementation where possible. Can you also send the qemu side of it? with vhost-blk: # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s real2m6.137s user0m0.281s sys 0m14.725s without vhost-blk: (virtio) --- # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 275.466 seconds, 305 MB/s real4m35.468s user0m0.373s sys 0m48.074s Which caching mode is this? I assume data=writeback, because otherwise you'd be doing synchronous I/O directly from the handler. +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector, + struct iovec *iov, int in) +{ + loff_t pos = sector 8; + int ret = 0; + + if (type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(file, file-f_path.dentry, 1); + } else if (type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(file, iov, in, pos); + } else { + ret = vfs_readv(file, iov, in, pos); + } + return ret; I have to admit I don't understand the vhost architecture at all, but where do the actual data pointers used by the iovecs reside? vfs_readv/writev expect both the iovec itself and the buffers pointed to by it to reside in userspace, so just using kernel buffers here will break badly on architectures with different user/kernel mappings. A lot of this is fixable using simple set_fs co tricks, but for direct I/O which uses get_user_pages even that will fail badly. Also it seems like you're doing all the I/O synchronous here? 
For data=writeback operations that could explain the read speedup as you're avoiding context switches, but for actual write I/O which has to get data to disk (either directly from vfs_writev or later through vfs_fsync) this seems like a really bad idea stealing a lot of guest time that should happen in the background. Other than that the code seems quite nice and simple, but one huge problem is that it'll only support raw images, and thus misses out on all the nice image formats used in qemu deployments, especially qcow2. It's also missing the ioctl magic we're having in various places, both for controlling host devices like cdroms and SG passthrough. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On Tue, Mar 23, 2010 at 12:03:14PM +0200, Avi Kivity wrote: I also think it should be done at the bio layer. File I/O is going to be slower, if we do vhost-blk we should concentrate on maximum performance. The block layer also exposes more functionality we can use (asynchronous barriers for example). The block layer is more flexible, but that limits you to only stack directly on top of a block device, which is extremely inflexible. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Christoph Hellwig wrote: Inspired by vhost-net implementation, I did initial prototype of vhost-blk to see if it provides any benefits over QEMU virtio-blk. I haven't handled all the error cases, fixed naming conventions etc., but the implementation is stable to play with. I tried not to deviate from vhost-net implementation where possible. Can you also send the qemu side of it? with vhost-blk: # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s real2m6.137s user0m0.281s sys 0m14.725s without vhost-blk: (virtio) --- # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 275.466 seconds, 305 MB/s real4m35.468s user0m0.373s sys 0m48.074s Which caching mode is this? I assume data=writeback, because otherwise you'd be doing synchronous I/O directly from the handler. Yes. This is with default (writeback) cache model. As mentioned earlier, readhead is helping here and most cases, data would be ready in the pagecache. +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector, + struct iovec *iov, int in) +{ + loff_t pos = sector 8; + int ret = 0; + + if (type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(file, file-f_path.dentry, 1); + } else if (type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(file, iov, in, pos); + } else { + ret = vfs_readv(file, iov, in, pos); + } + return ret; I have to admit I don't understand the vhost architecture at all, but where do the actual data pointers used by the iovecs reside? vfs_readv/writev expect both the iovec itself and the buffers pointed to by it to reside in userspace, so just using kernel buffers here will break badly on architectures with different user/kernel mappings. A lot of this is fixable using simple set_fs co tricks, but for direct I/O which uses get_user_pages even that will fail badly. 
iovecs and buffers are user-space pointers (from the host kernel point of view). They are guest address. So, I don't need to do any set_fs tricks. Also it seems like you're doing all the I/O synchronous here? For data=writeback operations that could explain the read speedup as you're avoiding context switches, but for actual write I/O which has to get data to disk (either directly from vfs_writev or later through vfs_fsync) this seems like a really bad idea stealing a lot of guest time that should happen in the background. Yes. QEMU virtio-blk is batching up all the writes and handing of the work to another thread. When the writes() are complete, its sending a status completion. Since I am doing everything synchronous (even though its write to pagecache) one request at a time, that explains the slow down. We need to find a way to 1) batch IO writes together 2) hand off to another thread to do the IO, so that vhost-thread can handle next set of requests 3) update the status on the completion What do should I do here ? I can create bunch of kernel threads to do the IO for me. Or some how fit and reuse AIO io_submit() mechanism. Whats the best way here ? I hate do duplicate all the code VFS is doing. Other than that the code seems quite nice and simple, but one huge problem is that it'll only support raw images, and thus misses out on all the nice image formats used in qemu deployments, especially qcow2. It's also missing the ioctl magic we're having in various places, both for controlling host devices like cdroms and SG passthrough. True... unfortunately, I don't understand all of those (qcow2) details yet !! I need to read up on those, to even make a comment :( Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Christoph Hellwig wrote: Inspired by vhost-net implementation, I did initial prototype of vhost-blk to see if it provides any benefits over QEMU virtio-blk. I haven't handled all the error cases, fixed naming conventions etc., but the implementation is stable to play with. I tried not to deviate from vhost-net implementation where possible. Can you also send the qemu side of it? Its pretty hacky and based it on old patch (vhost-net) from MST for simplicity. I haven't focused on cleaning it up and I will re-base it on MST's latest code once it gets into QEMU. Thanks, Badari --- hw/virtio-blk.c | 199 1 file changed, 199 insertions(+) Index: vhost/hw/virtio-blk.c === --- vhost.orig/hw/virtio-blk.c 2010-02-25 16:47:04.0 -0500 +++ vhost/hw/virtio-blk.c 2010-03-17 14:07:26.477430740 -0400 @@ -18,6 +18,7 @@ #ifdef __linux__ # include scsi/sg.h #endif +#include kvm.h typedef struct VirtIOBlock { @@ -28,8 +29,13 @@ char serial_str[BLOCK_SERIAL_STRLEN + 1]; QEMUBH *bh; size_t config_size; +uint8_t vhost_started; } VirtIOBlock; +typedef struct BDRVRawState { +int fd; +} BDRVRawState; + static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev) { return (VirtIOBlock *)vdev; @@ -501,6 +507,198 @@ return 0; } +#if 1 +#include linux/vhost.h +#include sys/ioctl.h +#include sys/eventfd.h +#include vhost.h + +int vhost_blk_fd; + +struct slot_info { +unsigned long phys_addr; +unsigned long len; +unsigned long userspace_addr; +unsigned flags; +int logging_count; +}; + +extern struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS]; + +static int vhost_blk_start(struct VirtIODevice *vdev) +{ + target_phys_addr_t s, l, a; + int r, num, idx = 0; + struct vhost_vring_state state; + struct vhost_vring_file file; + struct vhost_vring_addr addr; + unsigned long long used_phys; + void *desc, *avail, *used; + int i, n =0; + struct VirtQueue *q = virtio_queue(vdev, idx); + VirtIOBlock *vb = to_virtio_blk(vdev); + struct vhost_memory *mem; + BDRVRawState *st = vb-bs-opaque; + + vhost_blk_fd = 
open(/dev/vhost-blk, O_RDWR); + if (vhost_blk_fd 0) { + fprintf(stderr, unable to open vhost-blk\n); + return -errno; + } + + r = ioctl(vhost_blk_fd, VHOST_SET_OWNER, NULL); +if (r 0) { + fprintf(stderr, ioctl VHOST_SET_OWNER failed\n); +return -errno; + } + +for (i = 0; i KVM_MAX_NUM_MEM_REGIONS; ++i) { +if (!slots[i].len || + (slots[i].flags KVM_MEM_LOG_DIRTY_PAGES)) { + continue; +} +++n; +} + +mem = qemu_mallocz(offsetof(struct vhost_memory, regions) + + n * sizeof(struct vhost_memory_region)); +if (!mem) +return -ENOMEM; + +mem-nregions = n; +n = 0; +for (i = 0; i KVM_MAX_NUM_MEM_REGIONS; ++i) { +if (!slots[i].len || (slots[i].flags + KVM_MEM_LOG_DIRTY_PAGES)) { +continue; +} +mem-regions[n].guest_phys_addr = slots[i].phys_addr; +mem-regions[n].memory_size = slots[i].len; +mem-regions[n].userspace_addr = slots[i].userspace_addr; +++n; +} + +r = ioctl(vhost_blk_fd, VHOST_SET_MEM_TABLE, mem); +if (r 0) +return -errno; + + state.index = idx; + num = state.num = virtio_queue_get_num(vdev, idx); + r = ioctl(vhost_blk_fd, VHOST_SET_VRING_NUM, state); +if (r) { + fprintf(stderr, ioctl VHOST_SET_VRING_NUM failed\n); +return -errno; +} + + state.num = virtio_queue_last_avail_idx(vdev, idx); + r = ioctl(vhost_blk_fd, VHOST_SET_VRING_BASE, state); + if (r) { + fprintf(stderr, ioctl VHOST_SET_VRING_BASE failed\n); +return -errno; + } + + s = l = sizeof(struct vring_desc) * num; + a = virtio_queue_get_desc(vdev, idx); + desc = cpu_physical_memory_map(a, l, 0); + if (!desc || l != s) { +r = -ENOMEM; +goto fail_alloc; + } + s = l = offsetof(struct vring_avail, ring) + +sizeof(u_int64_t) * num; +a = virtio_queue_get_avail(vdev, idx); +avail = cpu_physical_memory_map(a, l, 0); +if (!avail || l != s) { +r = -ENOMEM; +goto fail_alloc; +} +s = l = offsetof(struct vring_used, ring) + +sizeof(struct vring_used_elem) * num; +used_phys = a = virtio_queue_get_used(vdev, idx); +used =
Re: [RFC] vhost-blk implementation
On 03/23/2010 03:00 AM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier [RFC] vhost-blk implementation.eml Subject: [RFC] vhost-blk implementation From: Badari Pulavarty pbad...@us.ibm.com Date: Mon, 22 Mar 2010 17:34:06 -0700 To: virtualizat...@lists.linux-foundation.org, qemu-de...@nongnu.org Hi, Inspired by vhost-net implementation, I did initial prototype of vhost-blk to see if it provides any benefits over QEMU virtio-blk. I haven't handled all the error cases, fixed naming conventions etc., but the implementation is stable to play with. I tried not to deviate from vhost-net implementation where possible. NOTE: Only change I had to make to vhost core code is to increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h Performance: = I have done simple tests to see how it performs. I got very encouraging results on sequential read tests. But on sequential write tests, I see degrade over virtio-blk. I can't figure out and explain why. Can some one shed light on whats happening here ? Read Results: = Test does read of 84GB file from the host (through virtio). I unmount and mount the filesystem on the host to make sure there is nothing in the page cache.. +#define VHOST_BLK_VQ_MAX 1 + +struct vhost_blk { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; + struct vhost_poll poll[VHOST_BLK_VQ_MAX]; +}; + +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector, + struct iovec *iov, int in) +{ + loff_t pos = sector 8; + int ret = 0; + + if (type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(file, file-f_path.dentry, 1); + } else if (type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(file, iov, in,pos); + } else { + ret = vfs_readv(file, iov, in,pos); + } + return ret; +} This should be done asynchronously. That is likely the cause of write performance degradation. For reads, readahead means that that you're async anyway, but writes/syncs are still synchronous. I also think it should be done at the bio layer. 
File I/O is going to be slower, if we do vhost-blk we should concentrate on maximum performance. The block layer also exposes more functionality we can use (asynchronous barriers for example). btw, for fairness, cpu measurements should be done from the host side and include the vhost thread. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/23/2010 04:50 AM, Badari Pulavarty wrote: Anthony Liguori wrote: On 03/22/2010 08:45 PM, Badari Pulavarty wrote: Anthony Liguori wrote: On 03/22/2010 08:00 PM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier These virtio results are still with a 2.6.18 kernel with no aio, right? Results are on 2.6.33-rc8-net-next kernel. But not using AIO. Can you try with aio and cache=none? aio-native,cache=none # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 470.588 seconds, 170 MB/s You're effectively killing readahead with this. Please use fio with a sequential pattern, queue depth 1. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
with vhost-blk: # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct Can you please check both read & write with cache=off, bs=128k Thanks very much, Eran -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Avi Kivity wrote: On 03/23/2010 04:50 AM, Badari Pulavarty wrote: Anthony Liguori wrote: On 03/22/2010 08:45 PM, Badari Pulavarty wrote: Anthony Liguori wrote: On 03/22/2010 08:00 PM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier These virtio results are still with a 2.6.18 kernel with no aio, right? Results are on 2.6.33-rc8-net-next kernel. But not using AIO. Can you try with aio and cache=none? aio-native,cache=none # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 470.588 seconds, 170 MB/s You're effectively killing readahead with this. Yes. cache=none, aio=native will force it to hit the disk all the time. We lose all the benefits of readahead. Please use fio with a sequential pattern, queue depth 1. Will do. Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Avi Kivity wrote: On 03/23/2010 03:00 AM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier [RFC] vhost-blk implementation.eml Subject: [RFC] vhost-blk implementation From: Badari Pulavarty pbad...@us.ibm.com Date: Mon, 22 Mar 2010 17:34:06 -0700 To: virtualizat...@lists.linux-foundation.org, qemu-de...@nongnu.org Hi, Inspired by vhost-net implementation, I did initial prototype of vhost-blk to see if it provides any benefits over QEMU virtio-blk. I haven't handled all the error cases, fixed naming conventions etc., but the implementation is stable to play with. I tried not to deviate from vhost-net implementation where possible. NOTE: Only change I had to make to vhost core code is to increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h Performance: = I have done simple tests to see how it performs. I got very encouraging results on sequential read tests. But on sequential write tests, I see degrade over virtio-blk. I can't figure out and explain why. Can some one shed light on whats happening here ? Read Results: = Test does read of 84GB file from the host (through virtio). I unmount and mount the filesystem on the host to make sure there is nothing in the page cache.. +#define VHOST_BLK_VQ_MAX 1 + +struct vhost_blk { +struct vhost_dev dev; +struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; +struct vhost_poll poll[VHOST_BLK_VQ_MAX]; +}; + +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector, +struct iovec *iov, int in) +{ +loff_t pos = sector 8; +int ret = 0; + +if (type VIRTIO_BLK_T_FLUSH) { +ret = vfs_fsync(file, file-f_path.dentry, 1); +} else if (type VIRTIO_BLK_T_OUT) { +ret = vfs_writev(file, iov, in,pos); +} else { +ret = vfs_readv(file, iov, in,pos); +} +return ret; +} This should be done asynchronously. That is likely the cause of write performance degradation. For reads, readahead means that that you're async anyway, but writes/syncs are still synchronous. I am not sure what you mean by async here. 
Even if I use f_op-aio_write() its still synchronous (except for DIO). Since we are writing to pagecache and not waiting for write() to complete, this is the best we can do here. Do you mean offload write() handling to another thread ? I also think it should be done at the bio layer. I am not sure what you meant here. Do you want to do submit_bio() directly ? Its not going to be that simple. Since the sector# is offset within the file, one have to do getblocks() on it to find the real-disk-block#s + we have to do get_user_pages() on these iovecs before submitting them to bio.. All of this work is done by vfs_write()/vfs_read() anyway.. I am not sure what you are suggesting here.. File I/O is going to be slower, if we do vhost-blk we should concentrate on maximum performance. The block layer also exposes more functionality we can use (asynchronous barriers for example). btw, for fairness, cpu measurements should be done from the host side and include the vhost thread. Will do. Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/23/2010 04:55 PM, Badari Pulavarty wrote: This should be done asynchronously. That is likely the cause of write performance degradation. For reads, readahead means that that you're async anyway, but writes/syncs are still synchronous. I am not sure what you mean by async here. Even if I use f_op-aio_write() its still synchronous (except for DIO). Since we are writing to pagecache and not waiting for write() to complete, this is the best we can do here. It fails with O_DIRECT or O_DSYNC, or if the write cache is full and we need to wait on writeout. Reads that don't hit pagecache will also be synchronous. You could use threads (like qemu) or use the block layer (which is async). Do you mean offload write() handling to another thread ? Yeah. Read too. I also think it should be done at the bio layer. I am not sure what you meant here. Do you want to do submit_bio() directly ? Its not going to be that simple. Since the sector# is offset within the file, one have to do getblocks() on it to find the real-disk-block#s + we have to do get_user_pages() on these iovecs before submitting them to bio.. All of this work is done by vfs_write()/vfs_read() anyway.. I am not sure what you are suggesting here.. For the case where the file is actually a partition, use submit_bio(). When the file is a file, keep it in qemu, that path is going to be slower anyway. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC] vhost-blk implementation
Forgot to CC: KVM list earlier ---BeginMessage--- Hi, Inspired by vhost-net implementation, I did initial prototype of vhost-blk to see if it provides any benefits over QEMU virtio-blk. I haven't handled all the error cases, fixed naming conventions etc., but the implementation is stable to play with. I tried not to deviate from vhost-net implementation where possible. NOTE: Only change I had to make to vhost core code is to increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h Performance: = I have done simple tests to see how it performs. I got very encouraging results on sequential read tests. But on sequential write tests, I see degrade over virtio-blk. I can't figure out and explain why. Can some one shed light on whats happening here ? Read Results: = Test does read of 84GB file from the host (through virtio). I unmount and mount the filesystem on the host to make sure there is nothing in the page cache.. with vhost-blk: # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s real2m6.137s user0m0.281s sys 0m14.725s without vhost-blk: (virtio) --- # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 275.466 seconds, 305 MB/s real4m35.468s user0m0.373s sys 0m48.074s Write Results: == I see degraded IO performance when doing sequential IO write tests with vhost-blk compared to virtio-blk. # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with vhost-blk. Wondering why ? Comments/flames ? Thanks, Badari vhost-blk is in-kernel accelerator for virtio-blk. At this time, this is a prototype based on virtio-net. Lots of error handling and clean up needs to be done. Read performance is pretty good over QEMU virtio-blk, but write performance is not anywhere close to QEMU virtio-blk. Why ? 
Signed-off-by: Badari Pulavarty pbad...@us.ibm.com --- drivers/vhost/blk.c | 242 1 file changed, 242 insertions(+) Index: net-next/drivers/vhost/blk.c === --- /dev/null 1970-01-01 00:00:00.0 + +++ net-next/drivers/vhost/blk.c2010-03-22 18:07:18.156584400 -0400 @@ -0,0 +1,242 @@ + /* + * virtio-block server in host kernel. + * Inspired by vhost-net and shamlessly ripped code from it :) + */ + +#include linux/compat.h +#include linux/eventfd.h +#include linux/vhost.h +#include linux/virtio_net.h +#include linux/virtio_blk.h +#include linux/mmu_context.h +#include linux/miscdevice.h +#include linux/module.h +#include linux/mutex.h +#include linux/workqueue.h +#include linux/rcupdate.h +#include linux/file.h + +#include vhost.h + +#define VHOST_BLK_VQ_MAX 1 + +struct vhost_blk { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; + struct vhost_poll poll[VHOST_BLK_VQ_MAX]; +}; + +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector, + struct iovec *iov, int in) +{ + loff_t pos = sector 8; + int ret = 0; + + if (type VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(file, file-f_path.dentry, 1); + } else if (type VIRTIO_BLK_T_OUT) { + ret = vfs_writev(file, iov, in, pos); + } else { + ret = vfs_readv(file, iov, in, pos); + } + return ret; +} + +static void handle_blk(struct vhost_blk *blk) +{ + struct vhost_virtqueue *vq = blk-dev.vqs[0]; + unsigned head, out, in; + struct virtio_blk_outhdr hdr; + int r, nvecs; + uint8_t status = 0; + + use_mm(blk-dev.mm); + mutex_lock(vq-mutex); + + vhost_disable_notify(vq); + + for (;;) { + head = vhost_get_vq_desc(blk-dev, vq, vq-iov, +ARRAY_SIZE(vq-iov), +out, in, NULL, NULL); + if (head == vq-num) { + if (unlikely(vhost_enable_notify(vq))) { + vhost_disable_notify(vq); + continue; + } + break; + } + + BUG_ON(vq-iov[0].iov_len != 16); + + r = copy_from_user(hdr, vq-iov[0].iov_base, sizeof hdr); + if (r 0) { + printk(copy from user failed\n); + vhost_discard_vq_desc(vq); + break; + } + + nvecs = out - 
1; + if (hdr.type == VIRTIO_BLK_T_IN) + nvecs = in - 1; + + r = do_handle_io(vq-private_data, hdr.type, hdr.sector, vq-iov[1], nvecs); +
Re: [RFC] vhost-blk implementation
On 03/22/2010 08:00 PM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier These virtio results are still with a 2.6.18 kernel with no aio, right? Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Anthony Liguori wrote: On 03/22/2010 08:00 PM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier These virtio results are still with a 2.6.18 kernel with no aio, right? Results are on 2.6.33-rc8-net-next kernel. But not using AIO. Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
On 03/22/2010 08:45 PM, Badari Pulavarty wrote: Anthony Liguori wrote: On 03/22/2010 08:00 PM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier These virtio results are still with a 2.6.18 kernel with no aio, right? Results are on 2.6.33-rc8-net-next kernel. But not using AIO. Can you try with aio and cache=none? Regards, Anthony Liguori Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] vhost-blk implementation
Anthony Liguori wrote: On 03/22/2010 08:45 PM, Badari Pulavarty wrote: Anthony Liguori wrote: On 03/22/2010 08:00 PM, Badari Pulavarty wrote: Forgot to CC: KVM list earlier These virtio results are still with a 2.6.18 kernel with no aio, right? Results are on 2.6.33-rc8-net-next kernel. But not using AIO. Can you try with aio and cache=none? aio-native,cache=none # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 470.588 seconds, 170 MB/s Thanks, Badari -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html