On Fri, Nov 02, 2018 at 06:21:23PM +0000, Vitaly Mayatskikh wrote:
> This driver accelerates host side of virtio-blk.
> 
> Signed-off-by: Vitaly Mayatskikh <v.mayats...@gmail.com>
> ---
>  drivers/vhost/Kconfig  |  13 ++
>  drivers/vhost/Makefile |   3 +
>  drivers/vhost/blk.c    | 510 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 526 insertions(+)
>  create mode 100644 drivers/vhost/blk.c
> 
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index b580885243f7..c4980d6af0ea 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
>         adds some overhead, it is disabled by default.
>  
>         If unsure, say "N".
> +
> +config VHOST_BLK
> +     tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> +     depends on BLOCK && EVENTFD
> +     select VHOST
> +     default n
> +     help
> +      This kernel module can be loaded in host kernel to accelerate
> +      guest block with virtio_blk. Not to be confused with virtio_blk
> +      module itself which needs to be loaded in guest kernel.
> +
> +      To compile this driver as a module, choose M here: the module will
> +      be called vhost_blk.
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6c6df24f770c..c8be36cd9214 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
>  obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
>  vhost_vsock-y := vsock.o
>  
> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> +vhost_blk-y := blk.o
> +
>  obj-$(CONFIG_VHOST_RING) += vringh.o
>  
>  obj-$(CONFIG_VHOST)  += vhost.o
> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> new file mode 100644
> index 000000000000..aefb9a61fa0f
> --- /dev/null
> +++ b/drivers/vhost/blk.c
> @@ -0,0 +1,510 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 IBM Corporation
> + * Author: Vitaly Mayatskikh <v.mayats...@gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + *
> + * virtio-blk server in host kernel.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/virtio_blk.h>
> +#include <linux/vhost.h>
> +#include <linux/fs.h>
> +#include "vhost.h"
> +
> +enum {
> +     VHOST_BLK_FEATURES =
> +     VHOST_FEATURES |
> +     (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> +     (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> +     (1ULL << VIRTIO_RING_F_EVENT_IDX) |
> +     (1ULL << VIRTIO_BLK_F_MQ)
> +};
> +
> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
> +
> +enum {
> +     VHOST_BLK_VQ_MAX = 16,
> +     VHOST_BLK_VQ_MAX_REQS = 128,
> +};
> +
> +struct vhost_blk_req {
> +     struct llist_node list;
> +     int index;
> +     struct vhost_blk_queue *q;
> +     struct virtio_blk_outhdr hdr;
> +     struct iovec *out_iov;
> +     struct iovec *in_iov;
> +     u8 out_num;
> +     u8 in_num;
> +     long len;
> +     struct kiocb iocb;
> +     struct iov_iter i;
> +     int res;
> +     void __user *status;
> +};
> +
> +struct vhost_blk_queue {
> +     int index;
> +     struct vhost_blk *blk;
> +     struct vhost_virtqueue vq;
> +     struct vhost_work w;
> +     struct llist_head wl;
> +     struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
> +};
> +
> +struct vhost_blk {
> +     struct vhost_dev dev;
> +     struct file *backend;
> +     int num_queues;
> +     struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
> +     struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
> +};
> +
> +static void vhost_blk_flush(struct vhost_blk *blk)
> +{
> +     int i;
> +
> +     for (i = 0; i < blk->num_queues; i++)
> +             vhost_poll_flush(&blk->queue[i].vq.poll);
> +}
> +
> +
> +static void vhost_blk_stop(struct vhost_blk *blk)
> +{
> +     struct vhost_virtqueue *vq;
> +     int i;
> +
> +     for (i = 0; i < blk->num_queues; i++) {
> +             vq = &blk->queue[i].vq;
> +             mutex_lock(&vq->mutex);
> +             rcu_assign_pointer(vq->private_data, NULL);
> +             mutex_unlock(&vq->mutex);
> +     }
> +}
> +
> +static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char 
> status)
> +{
> +     int ret;
> +     int len = req->len;
> +
> +     pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
> +              req->q->index, req->index, status, len);
> +     ret = put_user(status, (unsigned char __user *)req->status);

I'd make this a u8 rather than unsigned char. Also, why not change the
type of req->status itself, so the cast isn't needed?

> +
> +     WARN(ret, "%s: vq[%d] req->index %d failed to write status\n", __func__,
> +          req->q->index, req->index);

Kernel warnings and debug messages that are guest-triggerable can lead to
disk-full errors on the host. The same applies elsewhere in this patch;
you want tracepoints instead.

> +
> +     vhost_add_used(&req->q->vq, req->index, len);


This call can fail too; the return value should be checked.

> +
> +     return ret;
> +}
> +
> +static void vhost_blk_io_done_work(struct vhost_work *w)
> +{
> +     struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
> +     struct llist_node *node;
> +     struct vhost_blk_req *req, *tmp;
> +
> +     node = llist_del_all(&q->wl);
> +     llist_for_each_entry_safe(req, tmp, node, list) {
> +             vhost_blk_req_done(req, req->res);
> +     }
> +     vhost_signal(&q->blk->dev, &q->vq);
> +}
> +
> +static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
> +{
> +     struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
> +                                              iocb);
> +
> +     pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
> +              req->q->index, req->index, ret, ret2);
> +
> +     req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
> +     llist_add(&req->list, &req->q->wl);
> +     vhost_vq_work_queue(&req->q->vq, &req->q->w);
> +}
> +
> +static int vhost_blk_req_handle(struct vhost_blk_req *req)
> +{
> +     struct vhost_blk *blk = req->q->blk;
> +     struct vhost_virtqueue *vq = &req->q->vq;
> +     int type = le32_to_cpu(req->hdr.type);
> +     int ret;
> +     u8 status;
> +
> +     if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> +             bool write = (type == VIRTIO_BLK_T_OUT);
> +             int nr_seg = (write ? req->out_num : req->in_num) - 1;
> +             unsigned long sector = le64_to_cpu(req->hdr.sector);
> +             ssize_t len, rem_len;
> +
> +             if (!req->q->blk->backend) {
> +                     vq_err(vq, "blk %p no backend!\n", req->q->blk);
> +                     ret = -EINVAL;
> +                     goto out_err;
> +             }
> +
> +             len = iov_length(&vq->iov[1], nr_seg);
> +             pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
> +                      __func__, current->pid, current->comm,
> +                      write ? "WRITE" : "READ", req->hdr.sector, len);
> +
> +             req->len = len;
> +             rem_len = len;
> +             iov_iter_init(&req->i, (write ? WRITE : READ),
> +                           write ? &req->out_iov[0] : &req->in_iov[0],
> +                           nr_seg, len);
> +
> +             req->iocb.ki_pos = sector << 9;
> +             req->iocb.ki_filp = blk->backend;
> +             req->iocb.ki_complete = vhost_blk_iocb_complete;
> +             req->iocb.ki_flags = IOCB_DIRECT;
> +
> +             if (write)
> +                     ret = call_write_iter(blk->backend, &req->iocb,
> +                                           &req->i);
> +             else
> +                     ret = call_read_iter(blk->backend, &req->iocb,
> +                                          &req->i);
> +
> +             if (ret != -EIOCBQUEUED)
> +                     vhost_blk_iocb_complete(&req->iocb, ret, 0);
> +
> +             ret = 0;
> +             goto out;
> +     }
> +
> +     if (type == VIRTIO_BLK_T_GET_ID) {
> +             char s[] = "vhost_blk";

Isn't this supposed to return the device serial number?

> +             size_t len = min_t(size_t, req->in_iov[0].iov_len,
> +                                strlen(s));
> +
> +             ret = copy_to_user(req->in_iov[0].iov_base, s, len);

I don't think we should assume there's no scatter list here.

> +             status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> +             if (put_user(status, (unsigned char __user *)req->status)) {
> +                     ret = -EFAULT;
> +                     goto out_err;
> +             }
> +             vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
> +             ret = 0;
> +             goto out;
> +     } else {
> +             pr_warn("Unsupported request type %d\n", type);
> +             vhost_discard_vq_desc(vq, 1);
> +             ret = -EINVAL;
> +             return ret;
> +     }
> +out_err:
> +     vhost_discard_vq_desc(vq, 1);
> +out:
> +     return ret;
> +}
> +
> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
> +{
> +     struct vhost_virtqueue *vq;
> +     struct vhost_blk_queue *q;
> +     struct vhost_blk *blk;
> +     struct vhost_blk_req *req;
> +     int in, out;
> +     int head;
> +
> +     vq = container_of(work, struct vhost_virtqueue, poll.work);
> +     q = container_of(vq, struct vhost_blk_queue, vq);
> +     blk = container_of(vq->dev, struct vhost_blk, dev);
> +
> +     vhost_disable_notify(&blk->dev, vq);
> +     for (;;) {
> +             in = out = -1;
> +
> +             head = vhost_get_vq_desc(vq, vq->iov,
> +                                      ARRAY_SIZE(vq->iov),
> +                                      &out, &in, NULL, NULL);
> +
> +             if (head < 0)
> +                     break;
> +
> +             if (head == vq->num) {
> +                     if (vhost_enable_notify(&blk->dev, vq)) {
> +                             vhost_disable_notify(&blk->dev, vq);
> +                             continue;
> +                     }
> +                     break;
> +             }
> +
> +             req = &q->req[head];
> +             req->index = head;
> +             req->out_num = out;
> +             req->in_num = in;
> +             req->out_iov = &vq->iov[1];
> +             req->in_iov = &vq->iov[out];
> +             req->status = vq->iov[out + in - 1].iov_base;

Shouldn't we validate that there's actually an in?

> +
> +             if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
> +                                sizeof(req->hdr))) {
> +                     vq_err(vq, "Failed to get block header!\n");
> +                     vhost_discard_vq_desc(vq, 1);
> +                     continue;
> +             }

It's better to avoid assuming that header is in a single iov entry,
use an iterator.

> +             if (vhost_blk_req_handle(req) < 0)
> +                     break;
> +     }
> +}
> +
> +static int vhost_blk_open(struct inode *inode, struct file *file)
> +{
> +     struct vhost_blk *blk;
> +     struct vhost_blk_queue *q;
> +     int i, j;
> +
> +     blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
> +     if (!blk)
> +             return -ENOMEM;
> +
> +     for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
> +             q = &blk->queue[i];
> +             q->index = i;
> +             q->blk = blk;
> +             q->vq.handle_kick = vhost_blk_handle_guest_kick;
> +             vhost_work_init(&q->w, vhost_blk_io_done_work);
> +             blk->vqs[i] = &q->vq;
> +             for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
> +                     q->req[j].index = j;
> +                     q->req[j].q = q;
> +             }
> +     }
> +     vhost_dev_init(&blk->dev, (struct vhost_virtqueue **)&blk->vqs,
> +                    VHOST_BLK_VQ_MAX);
> +     file->private_data = blk;
> +
> +     return 0;
> +}
> +
> +static int vhost_blk_release(struct inode *inode, struct file *f)
> +{
> +     struct vhost_blk *blk = f->private_data;
> +
> +     vhost_blk_stop(blk);
> +     mutex_lock(&blk->dev.mutex);
> +     vhost_blk_flush(blk);
> +     vhost_dev_stop(&blk->dev);
> +     vhost_dev_cleanup(&blk->dev);
> +     vhost_blk_flush(blk);
> +
> +     if (blk->backend) {
> +             fput(blk->backend);
> +             blk->backend = NULL;
> +     }
> +
> +     mutex_unlock(&blk->dev.mutex);
> +     kvfree(blk);
> +
> +     return 0;
> +}
> +
> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
> +{
> +     int i;
> +     int ret = -EFAULT;
> +
> +     mutex_lock(&blk->dev.mutex);
> +     if ((features & (1 << VHOST_F_LOG_ALL)) &&
> +         !vhost_log_access_ok(&blk->dev))
> +             goto out_unlock;
> +
> +     if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
> +             if (vhost_init_device_iotlb(&blk->dev, true))
> +                     goto out_unlock;
> +     }
> +
> +     for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
> +             struct vhost_virtqueue *vq = blk->vqs[i];
> +
> +             mutex_lock(&vq->mutex);
> +             vq->acked_features = features & VHOST_BLK_FEATURES;
> +             mutex_unlock(&vq->mutex);
> +     }
> +     ret = 0;
> +out_unlock:
> +     mutex_unlock(&blk->dev.mutex);
> +
> +     return ret;
> +}
> +
> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
> +{
> +     long err;
> +     struct vhost_umem *umem;
> +
> +     mutex_lock(&blk->dev.mutex);
> +     err = vhost_dev_check_owner(&blk->dev);
> +     if (err)
> +             goto done;
> +     umem = vhost_dev_reset_owner_prepare();
> +     if (!umem) {
> +             err = -ENOMEM;
> +             goto done;
> +     }
> +     vhost_blk_stop(blk);
> +     vhost_blk_flush(blk);
> +     vhost_dev_reset_owner(&blk->dev, umem);
> +done:
> +     mutex_unlock(&blk->dev.mutex);
> +     return err;
> +}
> +
> +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
> +{
> +     struct file *backend;
> +     int ret, i;
> +     struct vhost_virtqueue *vq;
> +
> +     mutex_lock(&blk->dev.mutex);
> +     ret = vhost_dev_check_owner(&blk->dev);
> +     if (ret)
> +             goto out_dev;
> +
> +     backend = fget(fd);
> +     if (IS_ERR(backend)) {
> +             ret = PTR_ERR(backend);
> +             goto out_dev;
> +     }
> +
> +     if (backend == blk->backend) {
> +             ret = 0;
> +             goto out_file;
> +     }
> +
> +     if (blk->backend)
> +             fput(blk->backend);
> +     blk->backend = backend;
> +     for (i = 0; i < blk->num_queues; i++) {
> +             vq = &blk->queue[i].vq;
> +             if (!vhost_vq_access_ok(vq)) {
> +                     ret = -EFAULT;
> +                     goto out_file;
> +             }
> +             mutex_lock(&vq->mutex);
> +             rcu_assign_pointer(vq->private_data, backend);
> +             ret = vhost_vq_init_access(vq);
> +             mutex_unlock(&vq->mutex);
> +             if (ret) {
> +                     pr_err("vhost_vq_init_access failed: %d\n", ret);
> +                     goto out_file;
> +             }
> +
> +     }
> +     ret = 0;
> +     goto out_dev;
> +out_file:
> +     fput(backend);
> +     blk->backend = NULL;
> +out_dev:
> +     mutex_unlock(&blk->dev.mutex);
> +     vhost_blk_flush(blk);
> +     return ret;
> +}
> +
> +static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
> +                              void __user *argp)
> +{
> +     long ret;
> +
> +     mutex_lock(&blk->dev.mutex);
> +     ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
> +     if (ret == -ENOIOCTLCMD)
> +             ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
> +     else
> +             vhost_blk_flush(blk);
> +     mutex_unlock(&blk->dev.mutex);
> +     return ret;
> +}
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> +                         unsigned long arg)
> +{
> +     struct vhost_blk *blk = f->private_data;
> +     void __user *argp = (void __user *)arg;
> +     int fd;
> +     u64 __user *featurep = argp;
> +     u64 features;
> +     long ret;
> +     struct vhost_vring_state s;
> +
> +     switch (ioctl) {
> +     case VHOST_SET_MEM_TABLE:
> +             vhost_blk_stop(blk);
> +             ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +             break;
> +     case VHOST_SET_VRING_NUM:
> +             if (copy_from_user(&s, argp, sizeof(s)))
> +                     return -EFAULT;
> +             ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +             if (!ret)
> +                     blk->num_queues = s.index + 1;
> +             break;
> +     case VHOST_BLK_SET_BACKEND:
> +             if (copy_from_user(&fd, argp, sizeof(fd)))
> +                     return -EFAULT;
> +             ret = vhost_blk_set_backend(blk, fd);
> +             break;
> +     case VHOST_GET_FEATURES:
> +             features = VHOST_BLK_FEATURES;
> +             if (copy_to_user(featurep, &features, sizeof(features)))
> +                     return -EFAULT;
> +             ret = 0;
> +             break;
> +     case VHOST_SET_FEATURES:
> +             if (copy_from_user(&features, featurep, sizeof(features)))
> +                     return -EFAULT;
> +             if (features & ~VHOST_BLK_FEATURES)
> +                     return -EOPNOTSUPP;
> +             ret = vhost_blk_set_features(blk, features);
> +             break;
> +     case VHOST_RESET_OWNER:
> +             ret = vhost_blk_reset_owner(blk);
> +             break;
> +     default:
> +             ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> +             break;
> +     }
> +     return ret;
> +}
> +
> +static const struct file_operations vhost_blk_fops = {
> +     .owner          = THIS_MODULE,
> +     .open           = vhost_blk_open,
> +     .release        = vhost_blk_release,
> +     .llseek         = noop_llseek,
> +     .unlocked_ioctl = vhost_blk_ioctl,
> +};
> +
> +static struct miscdevice vhost_blk_misc = {
> +     MISC_DYNAMIC_MINOR,
> +     "vhost-blk",
> +     &vhost_blk_fops,
> +};
> +
> +static int vhost_blk_init(void)
> +{
> +     return misc_register(&vhost_blk_misc);
> +}
> +module_init(vhost_blk_init);
> +
> +static void vhost_blk_exit(void)
> +{
> +     misc_deregister(&vhost_blk_misc);
> +}
> +
> +module_exit(vhost_blk_exit);
> +
> +MODULE_VERSION("1.0");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Vitaly Mayatskikh");
> +MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
> +MODULE_ALIAS("devname:vhost-blk");
> -- 
> 2.17.1

Reply via email to