Although QEMU virtio is quite fast, there is still some room for
improvement. Disk latency can be reduced if we handle virtio-blk
requests in the host kernel instead of passing them to QEMU. This patch
adds a vhost-blk kernel module to do so.
Some test setups:
fio --direct=1 --rw=randread --bs=4k --ioengine=libaio --iodepth=128
QEMU drive options: cache=none
filesystem: xfs
SSD:
| randread, IOPS | randwrite, IOPS |
Host | 95.8k | 85.3k |
QEMU virtio | 57.5k | 79.4k |
QEMU vhost-blk | 95.6k | 84.3k |
RAMDISK (vq == vcpu):
| randread, IOPS | randwrite, IOPS |
virtio, 1vcpu | 123k | 129k |
virtio, 2vcpu | 253k (??) | 250k (??) |
virtio, 4vcpu | 158k | 154k |
vhost-blk, 1vcpu | 110k | 113k |
vhost-blk, 2vcpu | 247k | 252k |
vhost-blk, 4vcpu | 576k | 567k |
https://jira.sw.ru/browse/PSBM-139414
Signed-off-by: Andrey Zhadchenko <andrey.zhadche...@virtuozzo.com>
---
v2:
- removed unused VHOST_BLK_VQ
- reworked bio handling a bit: now add all pages from a single iov into
a single bio instead of allocating one bio per page
- changed how to calculate sector incrementation
- check move_iovec() in vhost_blk_req_handle()
- remove snprintf check and better check ret from copy_to_iter for
VIRTIO_BLK_ID_BYTES requests
- discard vq request if vhost_blk_req_handle() returned negative code
- forbid changing a nonzero backend in vhost_blk_set_backend(). First of
all, QEMU sets the backend only once. Also, if we want to change the
backend while we are already running requests, we need to be much more
careful in vhost_blk_handle_guest_kick(), as it does not take any
references. If userspace wants to change the backend that badly, it can
always reset the device.
- removed EXPERIMENTAL from Kconfig
v3:
- reworked bio handling a bit more: allocate a new bio only if the
previous one is full
drivers/vhost/Kconfig | 12 +
drivers/vhost/Makefile | 3 +
drivers/vhost/blk.c | 828 +++++++++++++++++++++++++++++++++++++
include/uapi/linux/vhost.h | 5 +
4 files changed, 848 insertions(+)
create mode 100644 drivers/vhost/blk.c
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 587fbae06182..e1389bf0c10b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -89,4 +89,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
If unsure, say "N".
+config VHOST_BLK
+ tristate "Host kernel accelerator for virtio-blk"
+ depends on BLOCK && EVENTFD
+ select VHOST
+ default n
+ help
+ This kernel module can be loaded in host kernel to accelerate
+ guest vm with virtio-blk driver.
+
+ To compile this driver as a module, choose M here: the module will
+ be called vhost_blk.
+
endif
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index f3e1897cce85..c76cc4f5fcd8 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_VHOST) += vhost.o
obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
vhost_iotlb-y := iotlb.o
+
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 000000000000..933c9c50b0a6
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,828 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2011 Taobao, Inc.
+ * Author: Liu Yuan <tailai...@taobao.com>
+ *
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Author: Asias He <as...@redhat.com>
+ *
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ * Author: Andrey Zhadchenko <andrey.zhadche...@virtuozzo.com>
+ *
+ * virtio-blk host kernel accelerator.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/vhost.h>
+#include <linux/virtio_blk.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+#include <linux/llist.h>
+
+#include "vhost.h"
+
+enum {
+ VHOST_BLK_FEATURES = VHOST_FEATURES | /* generic vhost bits plus the ring/blk bits below */
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+ (1ULL << VIRTIO_RING_F_EVENT_IDX) |
+ (1ULL << VIRTIO_BLK_F_MQ) |
+ (1ULL << VIRTIO_BLK_F_FLUSH),
+};
+
+/*
+ * Max number of bytes transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others.
+ */
+#define VHOST_DEV_WEIGHT 0x80000
+
+/*
+ * Max number of packets transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others with
+ * pkts.
+ */
+#define VHOST_DEV_PKT_WEIGHT 256
+
+#define VHOST_BLK_VQ_MAX 8 /* number of entries in vhost_blk.vqs[] */
+
+#define VHOST_MAX_METADATA_IOV 1 /* iovec slots reserved for the header and for the status byte */
+
+#define VHOST_BLK_SECTOR_BITS 9 /* log2 of the sector size used for bio sector arithmetic */
+#define VHOST_BLK_SECTOR_SIZE (1 << VHOST_BLK_SECTOR_BITS) /* 512 bytes */
+#define VHOST_BLK_SECTOR_MASK (VHOST_BLK_SECTOR_SIZE - 1) /* byte offset within a sector */
+
+struct req_page_list { /* user pages pinned for one data iovec of a request */
+ struct page **pages; /* first page pointer of this iovec inside the request's page array */
+ int pages_nr; /* number of pages pinned for this iovec */
+};
+
+#define NR_INLINE 16 /* requests needing at most this many pages use the inline_* arrays */
+
+struct vhost_blk_req { /* state of one guest request; slots live in vhost_blk_vq.req[] indexed by head */
+ struct req_page_list inline_pl[NR_INLINE]; /* page lists for requests of <= NR_INLINE pages */
+ struct page *inline_page[NR_INLINE]; /* page pointer storage for the inline case */
+ struct bio *inline_bio[NR_INLINE]; /* bio pointer storage for the inline case */
+ struct req_page_list *pl; /* inline_pl, a kmalloc'ed array, or NULL for a flush */
+ int during_flush; /* index into blk->req_inflight[] sampled at submit time */
+ bool use_inline; /* false when pl/bio point into a kmalloc'ed buffer */
+
+ struct llist_node llnode; /* links the request on blk_vq->llhead once all bios finished */
+
+ struct vhost_blk *blk; /* owning device */
+
+ struct iovec *iov; /* data iovecs; set to blk_vq->iov */
+ int iov_nr; /* number of data iovecs; vhost_blk_bio_make() recounts it as iovecs with pinned pages */
+
+ struct bio **bio; /* bios of this request (inline_bio or kmalloc'ed) */
+ atomic_t bio_nr; /* bios still outstanding; the last completion queues the request */
+
+ struct iovec status[VHOST_MAX_METADATA_IOV]; /* guest buffer receiving the status byte */
+
+ sector_t sector; /* start sector taken from the request header */
+ int bi_opf; /* REQ_OP_* chosen from the virtio-blk request type */
+ u16 head; /* descriptor head index of the request */
+ long len; /* iov_length() of the request minus the status byte */
+ int bio_err; /* errno of the most recently completed bio */
+
+ struct vhost_blk_vq *blk_vq; /* virtqueue the request arrived on */
+};
+
+struct vhost_blk_vq { /* per-virtqueue state of the vhost-blk device */
+ struct vhost_virtqueue vq; /* embedded vhost virtqueue; container_of() maps back to this struct */
+ struct vhost_blk_req *req; /* request slots indexed by descriptor head */
+ struct iovec iov[UIO_MAXIOV]; /* receives the data iovecs split off vq->iov */
+ struct llist_head llhead; /* requests whose bios have all completed */
+ struct vhost_work work; /* queued by vhost_blk_req_done() after adding to llhead */
+};
+
+struct vhost_blk { /* one vhost-blk device instance */
+ wait_queue_head_t flush_wait; /* NOTE(review): users are outside the visible part of the file */
+ struct vhost_blk_vq vqs[VHOST_BLK_VQ_MAX]; /* per-virtqueue state */
+ atomic_t req_inflight[2]; /* submitted-request counters, indexed by during_flush */
+ spinlock_t flush_lock; /* held while sampling during_flush and bumping req_inflight[] */
+ struct vhost_dev dev; /* embedded vhost device; container_of() maps back to this struct */
+ int during_flush; /* selects which req_inflight[] slot new requests are counted in */
+ struct file *backend; /* NOTE(review): presumably the backing block device file - confirm */
+ int index; /* number printed into the VIRTIO_BLK_T_GET_ID string */
+};
+
+static int gen; /* NOTE(review): not referenced in the visible part of the file */
+
+/*
+ * Carve the first @len bytes off the @from iovec array into @to.
+ *
+ * Consumed source entries are advanced in place (iov_base moves forward,
+ * iov_len shrinks), so a later call on the same @from array continues where
+ * this one stopped. Source entries that are already empty are skipped and do
+ * not use up a slot in @to.
+ *
+ * @from:           source iovec array, modified in place
+ * @to:             destination iovec array
+ * @len:            number of bytes to move
+ * @iov_count_from: number of entries available in @from
+ * @iov_count_to:   number of entries available in @to
+ *
+ * Returns the number of entries written to @to, or -1 when either array ran
+ * out before @len bytes were moved.
+ */
+static int move_iovec(struct iovec *from, struct iovec *to,
+		      size_t len, int iov_count_from, int iov_count_to)
+{
+	int src_idx = 0, dst_idx = 0;
+
+	while (len && src_idx < iov_count_from && dst_idx < iov_count_to) {
+		struct iovec *src = &from[src_idx++];
+		struct iovec *dst;
+		size_t chunk;
+
+		/* Already drained by an earlier call: nothing to take. */
+		if (!src->iov_len)
+			continue;
+
+		chunk = min(src->iov_len, len);
+
+		dst = &to[dst_idx++];
+		dst->iov_base = src->iov_base;
+		dst->iov_len = chunk;
+
+		src->iov_len -= chunk;
+		src->iov_base += chunk;
+		len -= chunk;
+	}
+
+	return len ? -1 : dst_idx;
+}
+
+/*
+ * Number of pages touched by the byte range [iov_base, iov_base + iov_len),
+ * counting partially covered first and last pages.
+ */
+static inline int iov_num_pages(struct iovec *iov)
+{
+	unsigned long start = (unsigned long)iov->iov_base;
+	unsigned long first_page = start & PAGE_MASK;
+	unsigned long past_last_page = PAGE_ALIGN(start + iov->iov_len);
+
+	return (past_last_page - first_page) >> PAGE_SHIFT;
+}
+
+/*
+ * Copy the one-byte virtio-blk status into the guest buffer described by
+ * req->status.
+ *
+ * Returns 0 on success. If the byte cannot be copied, the failure is
+ * reported through vq_err() and -EFAULT is returned.
+ */
+static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status)
+{
+	struct iov_iter iter;
+	size_t copied;
+
+	/*
+	 * The iterator is the destination of copy_to_iter(), which is the
+	 * READ direction in iov_iter terms (WRITE marks a data source).
+	 */
+	iov_iter_init(&iter, READ, req->status, ARRAY_SIZE(req->status),
+		      sizeof(status));
+	copied = copy_to_iter(&status, sizeof(status), &iter);
+	if (copied != sizeof(status)) {
+		vq_err(&req->blk_vq->vq, "Failed to write status\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * bio completion callback (installed as bi_end_io).
+ *
+ * Records the bio's status in the owning request and drops one count from
+ * req->bio_nr. The completion that brings the count to zero pushes the
+ * request onto its virtqueue's llhead and queues the virtqueue's work item.
+ * The bio reference is dropped in every case.
+ *
+ * NOTE(review): bio_err is overwritten by every completion, so for a
+ * multi-bio request only the status of the bio that finished last
+ * survives - confirm this is intended.
+ */
+static void vhost_blk_req_done(struct bio *bio)
+{
+	struct vhost_blk_req *req = bio->bi_private;
+	struct vhost_blk *blk = req->blk;
+	bool last_bio;
+
+	req->bio_err = blk_status_to_errno(bio->bi_status);
+
+	last_bio = atomic_dec_and_test(&req->bio_nr);
+	if (last_bio) {
+		llist_add(&req->llnode, &req->blk_vq->llhead);
+		vhost_work_queue(&blk->dev, &req->blk_vq->work);
+	}
+
+	bio_put(bio);
+}
+
+/*
+ * Release the user pages pinned for a request and free its page-list
+ * buffer.
+ *
+ * Walks the first req->iov_nr page lists and drops the reference on every
+ * page. When req->bi_opf is zero (REQ_OP_READ is 0 in the block layer) the
+ * pages are marked dirty first. A request without page lists (req->pl is
+ * NULL) has no pages to release. The page-list buffer is only freed when it
+ * is not the inline storage.
+ */
+static void vhost_blk_req_umap(struct vhost_blk_req *req)
+{
+	int seg, n;
+
+	for (seg = 0; req->pl && seg < req->iov_nr; seg++) {
+		struct req_page_list *list = &req->pl[seg];
+
+		for (n = 0; n < list->pages_nr; n++) {
+			struct page *pg = list->pages[n];
+
+			if (!req->bi_opf)
+				set_page_dirty_lock(pg);
+			put_page(pg);
+		}
+	}
+
+	if (req->use_inline)
+		return;
+
+	kfree(req->pl);
+}
+
+/*
+ * Build the single bio of a request that carries no data pages;
+ * vhost_blk_bio_make() uses it for REQ_OP_FLUSH.
+ *
+ * The request is switched to its inline bio array with no page lists, and
+ * req->bio_nr is set to 1 on success.
+ *
+ * Returns 0 on success or -ENOMEM when the bio cannot be allocated.
+ */
+static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
+				     struct block_device *bdev)
+{
+	struct bio *single;
+
+	/* No user pages are involved: inline bio slot, no page lists. */
+	req->bio = req->inline_bio;
+	req->pl = NULL;
+	req->use_inline = true;
+
+	single = bio_alloc(GFP_KERNEL, 1);
+	if (!single)
+		return -ENOMEM;
+
+	single->bi_iter.bi_sector = req->sector;
+	bio_set_dev(single, bdev);
+	single->bi_opf = req->bi_opf;
+	single->bi_end_io = vhost_blk_req_done;
+	single->bi_private = req;
+
+	req->bio[0] = single;
+	atomic_set(&req->bio_nr, 1);
+
+	return 0;
+}
+
+/*
+ * Allocate the out-of-line bookkeeping for a request that does not fit the
+ * inline arrays.
+ *
+ * One kmalloc() buffer is laid out as:
+ *   [iov_nr page lists][total_pages page pointers][total_pages bio pointers]
+ * req->pl and req->bio are pointed at the first and last region; the buffer
+ * is released through kfree(req->pl) in vhost_blk_req_umap().
+ *
+ * @total_pages: pages needed by all data iovecs of the request
+ * @iov_nr:      number of data iovecs
+ *
+ * The counts are derived from guest-supplied descriptors, so the size is
+ * computed in size_t with explicit overflow checks; doing it in int could
+ * wrap and yield an undersized buffer.
+ *
+ * Returns the start of the page pointer region, or NULL when the size is
+ * not representable or the allocation fails. req->use_inline is cleared in
+ * every case.
+ */
+static struct page **vhost_blk_prepare_req(struct vhost_blk_req *req,
+					   int total_pages, int iov_nr)
+{
+	const size_t per_page = sizeof(struct page *) + sizeof(struct bio *);
+	size_t pl_len, page_len, bio_len;
+	void *buf;
+
+	req->use_inline = false;
+
+	if (total_pages < 0 || iov_nr < 0)
+		return NULL;
+	if ((size_t)iov_nr > SIZE_MAX / sizeof(req->pl[0]))
+		return NULL;
+
+	pl_len = (size_t)iov_nr * sizeof(req->pl[0]);
+	if ((size_t)total_pages > (SIZE_MAX - pl_len) / per_page)
+		return NULL;
+
+	page_len = (size_t)total_pages * sizeof(struct page *);
+	bio_len = (size_t)total_pages * sizeof(struct bio *);
+
+	buf = kmalloc(pl_len + page_len + bio_len, GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	req->pl = buf;
+	req->bio = buf + pl_len + page_len;
+
+	return buf + pl_len;
+}
+
+/*
+ * Pin the user pages behind req->iov[] and pack them into bios targeting
+ * @bdev, starting at req->sector.
+ *
+ * REQ_OP_FLUSH requests carry no data and are handed to
+ * vhost_blk_bio_make_simple(). For everything else the bookkeeping arrays
+ * are either the inline ones (at most NR_INLINE pages) or a buffer from
+ * vhost_blk_prepare_req(). Pages are appended to the current bio and a new
+ * bio is only started when bio_add_page() refuses the page.
+ *
+ * On success req->bio[] holds the bios, req->bio_nr their count, and
+ * req->iov_nr the number of iovecs with pinned pages. req->sector is left
+ * at the start sector of the request.
+ *
+ * Returns 0 on success or a negative errno; on failure all bios built so
+ * far are dropped and all pinned pages are released.
+ *
+ * NOTE(review): bio_alloc() is asked for pages_nr vectors; presumably it
+ * fails for an iovec that spans more pages than one bio may hold - confirm
+ * whether the count should be clamped.
+ */
+static int vhost_blk_bio_make(struct vhost_blk_req *req,
+			      struct block_device *bdev)
+{
+	int pages_nr_total, i, j, ret;
+	struct iovec *iov = req->iov;
+	int iov_nr = req->iov_nr;
+	struct page **pages, *page;
+	struct bio *bio = NULL;
+	size_t done = 0;	/* bytes already placed into bios */
+	int bio_nr = 0;
+
+	if (unlikely(req->bi_opf == REQ_OP_FLUSH))
+		return vhost_blk_bio_make_simple(req, bdev);
+
+	pages_nr_total = 0;
+	for (i = 0; i < iov_nr; i++)
+		pages_nr_total += iov_num_pages(&iov[i]);
+
+	if (pages_nr_total > NR_INLINE) {
+		pages = vhost_blk_prepare_req(req, pages_nr_total, iov_nr);
+		if (!pages)
+			return -ENOMEM;
+	} else {
+		req->use_inline = true;
+		req->pl = req->inline_pl;
+		pages = req->inline_page;
+		req->bio = req->inline_bio;
+	}
+
+	/* From here on iov_nr counts only iovecs whose pages are pinned. */
+	req->iov_nr = 0;
+	for (i = 0; i < iov_nr; i++) {
+		int pages_nr = iov_num_pages(&iov[i]);
+		unsigned long iov_base, iov_len;
+		struct req_page_list *pl;
+
+		iov_base = (unsigned long)iov[i].iov_base;
+		iov_len = (unsigned long)iov[i].iov_len;
+
+		/* bi_opf == 0 is a read from disk, i.e. a write to memory. */
+		ret = get_user_pages_fast(iov_base, pages_nr,
+					  !req->bi_opf, pages);
+		if (ret != pages_nr) {
+			/*
+			 * A short pin is not yet recorded in req->pl[], so
+			 * vhost_blk_req_umap() would miss these pages.
+			 */
+			for (j = 0; j < ret; j++)
+				put_page(pages[j]);
+			ret = ret < 0 ? ret : -EFAULT;
+			goto fail;
+		}
+
+		req->iov_nr++;
+		pl = &req->pl[i];
+		pl->pages_nr = pages_nr;
+		pl->pages = pages;
+
+		for (j = 0; j < pages_nr; j++) {
+			unsigned int off, len;
+
+			page = pages[j];
+			off = iov_base & ~PAGE_MASK;
+			len = PAGE_SIZE - off;
+			if (len > iov_len)
+				len = iov_len;
+
+			if (!bio || !bio_add_page(bio, page, len, off)) {
+				bio = bio_alloc(GFP_KERNEL, pages_nr);
+				if (!bio) {
+					ret = -ENOMEM;
+					goto fail;
+				}
+				/*
+				 * A follow-up bio continues right after the
+				 * bytes queued so far.
+				 */
+				bio->bi_iter.bi_sector = req->sector +
+					(done >> VHOST_BLK_SECTOR_BITS);
+				bio_set_dev(bio, bdev);
+				bio->bi_private = req;
+				bio->bi_end_io = vhost_blk_req_done;
+				bio->bi_opf = req->bi_opf;
+				req->bio[bio_nr++] = bio;
+
+				/*
+				 * A page that does not fit an empty bio will
+				 * never fit; fail instead of retrying.
+				 */
+				if (!bio_add_page(bio, page, len, off)) {
+					ret = -EIO;
+					goto fail;
+				}
+			}
+
+			iov_base += len;
+			iov_len -= len;
+			done += len;
+		}
+
+		pages += pages_nr;
+	}
+	atomic_set(&req->bio_nr, bio_nr);
+	return 0;
+
+fail:
+	for (i = 0; i < bio_nr; i++)
+		bio_put(req->bio[i]);
+	vhost_blk_req_umap(req);
+	return ret;
+}
+
+/*
+ * Submit every bio of the request inside one block plug. The count is
+ * sampled from req->bio_nr before the first submission.
+ */
+static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
+{
+	int total = atomic_read(&req->bio_nr);
+	struct blk_plug plug;
+	int idx = 0;
+
+	blk_start_plug(&plug);
+	while (idx < total) {
+		submit_bio(req->bio[idx]);
+		idx++;
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Turn a prepared request into bios against the block device behind @file,
+ * submit them, and account the request as in flight.
+ *
+ * The in-flight slot is chosen from blk->during_flush under flush_lock and
+ * remembered in req->during_flush.
+ *
+ * Returns the result of vhost_blk_bio_make(); when that is negative nothing
+ * is submitted or accounted.
+ */
+static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file)
+{
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+	int err;
+
+	err = vhost_blk_bio_make(req, bdev);
+	if (err < 0)
+		return err;
+
+	vhost_blk_bio_send(req);
+
+	spin_lock(&req->blk->flush_lock);
+	req->during_flush = req->blk->during_flush;
+	atomic_inc(&req->blk->req_inflight[req->during_flush]);
+	spin_unlock(&req->blk->flush_lock);
+
+	return err;
+}
+
+/*
+ * Fill in the request slot for descriptor @head and dispatch the request
+ * according to hdr->type.
+ *
+ * @vq:           virtqueue the descriptor chain belongs to
+ * @hdr:          virtio-blk request header
+ * @head:         descriptor head; also the index into blk_vq->req[]
+ * @total_iov_nr: number of vq->iov entries that make up the request
+ * @file:         backend file passed on to vhost_blk_req_submit()
+ *
+ * The iovecs in vq->iov are split into the data part (moved to blk_vq->iov)
+ * and the trailing status byte (moved to req->status). IN, OUT and FLUSH
+ * are submitted as bios. GET_ID and unsupported types are answered right
+ * away: the status byte is written and the descriptor is returned through
+ * vhost_add_used_and_signal().
+ *
+ * Returns -EINVAL when the iovecs cannot be split as described, otherwise
+ * the result of vhost_blk_req_submit() or vhost_blk_set_status().
+ *
+ * NOTE(review): hdr->sector and hdr->type are used without a byte-order
+ * conversion here - confirm the caller has already converted them.
+ */
+static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
+				struct virtio_blk_outhdr *hdr,
+				u16 head, u16 total_iov_nr,
+				struct file *file)
+{
+	struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev);
+	struct vhost_blk_vq *blk_vq = container_of(vq, struct vhost_blk_vq, vq);
+	unsigned char id[VIRTIO_BLK_ID_BYTES];
+	struct vhost_blk_req *req;
+	struct iov_iter iter;
+	int ret, len;
+	u8 status;
+
+	req = &blk_vq->req[head];
+	req->blk_vq = blk_vq;
+	req->head = head;
+	req->blk = blk;
+	req->sector = hdr->sector;
+	req->iov = blk_vq->iov;
+
+	/* Everything except the final status byte is request data. */
+	req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status);
+	req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr,
+				 ARRAY_SIZE(blk_vq->iov));
+
+	ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr,
+			 ARRAY_SIZE(req->status));
+	if (ret < 0 || req->iov_nr < 0)
+		return -EINVAL;
+
+	switch (hdr->type) {
+	case VIRTIO_BLK_T_OUT:
+		req->bi_opf = REQ_OP_WRITE;
+		ret = vhost_blk_req_submit(req, file);
+		break;
+	case VIRTIO_BLK_T_IN:
+		req->bi_opf = REQ_OP_READ;
+		ret = vhost_blk_req_submit(req, file);
+		break;
+	case VIRTIO_BLK_T_FLUSH:
+		req->bi_opf = REQ_OP_FLUSH;
+		ret = vhost_blk_req_submit(req, file);
+		break;
+	case VIRTIO_BLK_T_GET_ID:
+		len = snprintf(id, VIRTIO_BLK_ID_BYTES, "vhost-blk%d",
+			       blk->index);
+		/*
+		 * The guest buffer is the destination of copy_to_iter(),
+		 * which is the READ direction in iov_iter terms.
+		 */
+		iov_iter_init(&iter, READ, req->iov, req->iov_nr, req->len);
+		ret = copy_to_iter(id, len, &iter);
+		status = ret != len ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+		ret = vhost_blk_set_status(req, status);
+		if (ret)
+			break;
+		vhost_add_used_and_signal(&blk->dev, vq, head, len);
+		break;
+	default:
+		vq_err(vq, "Unsupported request type %d\n", hdr->type);
+		status = VIRTIO_BLK_S_UNSUPP;
+		ret = vhost_blk_set_status(req, status);
+		if (ret)
+			break;
+		vhost_add_used_and_signal(&blk->dev, vq, head, 0);
+	}
+
+	return ret;
+}
+
+static void vhost_blk_handle_guest_kick(struct vhost_work *work)
+{
+ struct virtio_blk_outhdr hdr;
+ struct vhost_blk_vq *blk_vq;
+ struct vhost_virtqueue *vq;
+ struct iovec hdr_iovec[VHOST_MAX_METADATA_IOV];
+ struct vhost_blk *blk;
+ struct iov_iter iter;
+ int in, out, ret;
+ struct file *f;
+ u16 head;
+
+ vq = container_of(work, struct vhost_virtqueue, poll.work);
+ blk = container_of(vq->dev, struct vhost_blk, dev);
+ blk_vq = container_of(vq, struct vhost_blk_vq, vq);
+
+ f = vhost_vq_get_backend(vq);
+ if (!f)
+ return;
+
+ vhost_disable_notify(&blk->dev, vq);
+ for (;;) {
+ head = vhost_get_vq_desc(vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (unlikely(head < 0))
+ break;
+
+ if (unlikely(head == vq->num)) {
+ if (unlikely(vhost_enable_notify(&blk->dev, vq))) {
+ vhost_disable_notify(&blk->dev, vq);
+ continue;
+ }