On Sun, 17 Jul 2011, Michael S. Tsirkin wrote:

> From: Shirley Ma <[email protected]>
> 
> This adds experimental zero copy support in vhost-net,
> disabled by default. To enable, set the zerocopytx
> module option to 1.
> 
> This patch maintains the outstanding userspace buffers in the
> sequence it is delivered to vhost. The outstanding userspace buffers
> will be marked as done once the lower device buffers DMA has finished.
> This is monitored through last reference of kfree_skb callback. Two
> buffer indices are used for this purpose.
> 
> The vhost-net device passes the userspace buffers info to lower device
> skb through message control. DMA done status check and guest
> notification are handled by handle_tx: in the worst case all buffers
> in the vq are in pending/done status, so we need to notify the guest to
> release DMA done buffers first before we get any new buffers from the
> vq.
> 
> One known problem is that if the guest stops submitting
> buffers, buffers might never get used until some
> further action, e.g. device reset. This does not
> seem to affect linux guests.
> 
> Signed-off-by: Shirley Ma <[email protected]>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> 
> The below is what I came up with. We add the feature enabled
> by default for now as there are known issues, 

You mean "disabled" - right?


> but some
> guests can benefit so there's value in putting this
> in tree, to help the code get wider testing.
> 
>  drivers/vhost/net.c   |   73 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.c |   85 
> +++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |   29 +++++++++++++++++
>  3 files changed, 186 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index e224a92..226ca6b 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -12,6 +12,7 @@
>  #include <linux/virtio_net.h>
>  #include <linux/miscdevice.h>
>  #include <linux/module.h>
> +#include <linux/moduleparam.h>
>  #include <linux/mutex.h>
>  #include <linux/workqueue.h>
>  #include <linux/rcupdate.h>
> @@ -28,10 +29,18 @@
>  
>  #include "vhost.h"
>  
> +static int zcopytx;
> +module_param(zcopytx, int, 0444);

Should everyone be able to read this? How about "0440", just to be
paranoid — or perhaps some other, more restrictive, mode?

> +MODULE_PARM_DESC(zcopytx, "Enable Zero Copy Transmit");
> +
>  /* Max number of bytes transferred before requeueing the job.
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
>  
> +/* MAX number of TX used buffers for outstanding zerocopy */
> +#define VHOST_MAX_PEND 128
> +#define VHOST_GOODCOPY_LEN 256
> +
>  enum {
>       VHOST_NET_VQ_RX = 0,
>       VHOST_NET_VQ_TX = 1,
> @@ -54,6 +63,11 @@ struct vhost_net {
>       enum vhost_net_poll_state tx_poll_state;
>  };
>  
> +static bool vhost_sock_zcopy(struct socket *sock)
> +{
> +     return unlikely(zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY);
> +}
> +
>  /* Pop first len bytes from iovec. Return number of segments used. */
>  static int move_iovec_hdr(struct iovec *from, struct iovec *to,
>                         size_t len, int iov_count)
> @@ -129,6 +143,8 @@ static void handle_tx(struct vhost_net *net)
>       int err, wmem;
>       size_t hdr_size;
>       struct socket *sock;
> +     struct vhost_ubuf_ref *uninitialized_var(ubufs);
> +     bool zcopy;
>  
>       /* TODO: check that we are running from vhost_worker? */
>       sock = rcu_dereference_check(vq->private_data, 1);
> @@ -149,8 +165,13 @@ static void handle_tx(struct vhost_net *net)
>       if (wmem < sock->sk->sk_sndbuf / 2)
>               tx_poll_stop(net);
>       hdr_size = vq->vhost_hlen;
> +     zcopy = vhost_sock_zcopy(sock);
>  
>       for (;;) {
> +             /* Release DMAs done buffers first */
> +             if (zcopy)
> +                     vhost_zerocopy_signal_used(vq);
> +
>               head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>                                        ARRAY_SIZE(vq->iov),
>                                        &out, &in,
> @@ -166,6 +187,12 @@ static void handle_tx(struct vhost_net *net)
>                               set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>                               break;
>                       }
> +                     /* If more outstanding DMAs, queue the work */
> +                     if (vq->upend_idx - vq->done_idx > VHOST_MAX_PEND) {
> +                             tx_poll_start(net, sock);
> +                             set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
> +                             break;
> +                     }
>                       if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>                               vhost_disable_notify(&net->dev, vq);
>                               continue;
> @@ -188,9 +215,39 @@ static void handle_tx(struct vhost_net *net)
>                              iov_length(vq->hdr, s), hdr_size);
>                       break;
>               }
> +             /* use msg_control to pass vhost zerocopy ubuf info to skb */
> +             if (zcopy) {
> +                     vq->heads[vq->upend_idx].id = head;
> +                     if (len < VHOST_GOODCOPY_LEN) {
> +                             /* copy don't need to wait for DMA done */
> +                             vq->heads[vq->upend_idx].len =
> +                                                     VHOST_DMA_DONE_LEN;
> +                             msg.msg_control = NULL;
> +                             msg.msg_controllen = 0;
> +                             ubufs = NULL;
> +                     } else {
> +                             struct ubuf_info *ubuf = &vq->ubuf_info[head];
> +
> +                             vq->heads[vq->upend_idx].len = len;
> +                             ubuf->callback = vhost_zerocopy_callback;
> +                             ubuf->arg = vq->ubufs;
> +                             ubuf->desc = vq->upend_idx;
> +                             msg.msg_control = ubuf;
> +                             msg.msg_controllen = sizeof(ubuf);
> +                             ubufs = vq->ubufs;
> +                             kref_get(&ubufs->kref);
> +                     }
> +                     vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
> +             }
>               /* TODO: Check specific error and bomb out unless ENOBUFS? */
>               err = sock->ops->sendmsg(NULL, sock, &msg, len);
>               if (unlikely(err < 0)) {
> +                     if (zcopy) {
> +                             if (ubufs)
> +                                     vhost_ubuf_put(ubufs);
> +                             vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
> +                                     UIO_MAXIOV;
> +                     }
>                       vhost_discard_vq_desc(vq, 1);
>                       tx_poll_start(net, sock);
>                       break;
> @@ -198,7 +255,8 @@ static void handle_tx(struct vhost_net *net)
>               if (err != len)
>                       pr_debug("Truncated TX packet: "
>                                " len %d != %zd\n", err, len);
> -             vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +             if (!zcopy)
> +                     vhost_add_used_and_signal(&net->dev, vq, head, 0);
>               total_len += len;
>               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>                       vhost_poll_queue(&vq->poll);
> @@ -603,6 +661,7 @@ static long vhost_net_set_backend(struct vhost_net *n, 
> unsigned index, int fd)
>  {
>       struct socket *sock, *oldsock;
>       struct vhost_virtqueue *vq;
> +     struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
>       int r;
>  
>       mutex_lock(&n->dev.mutex);
> @@ -632,6 +691,13 @@ static long vhost_net_set_backend(struct vhost_net *n, 
> unsigned index, int fd)
>       oldsock = rcu_dereference_protected(vq->private_data,
>                                           lockdep_is_held(&vq->mutex));
>       if (sock != oldsock) {
> +             ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock));
> +             if (IS_ERR(ubufs)) {
> +                     r = PTR_ERR(ubufs);
> +                     goto err_ubufs;
> +             }
> +             oldubufs = vq->ubufs;
> +             vq->ubufs = ubufs;
>               vhost_net_disable_vq(n, vq);
>               rcu_assign_pointer(vq->private_data, sock);
>               vhost_net_enable_vq(n, vq);
> @@ -639,6 +705,9 @@ static long vhost_net_set_backend(struct vhost_net *n, 
> unsigned index, int fd)
>  
>       mutex_unlock(&vq->mutex);
>  
> +     if (oldubufs)
> +             vhost_ubuf_put_and_wait(oldubufs);
> +
>       if (oldsock) {
>               vhost_net_flush_vq(n, index);
>               fput(oldsock->file);
> @@ -647,6 +716,8 @@ static long vhost_net_set_backend(struct vhost_net *n, 
> unsigned index, int fd)
>       mutex_unlock(&n->dev.mutex);
>       return 0;
>  
> +err_ubufs:
> +     fput(sock->file);
>  err_vq:
>       mutex_unlock(&vq->mutex);
>  err:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index ea966b3..2ebf6fc 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -179,6 +179,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>       vq->call_ctx = NULL;
>       vq->call = NULL;
>       vq->log_ctx = NULL;
> +     vq->upend_idx = 0;
> +     vq->done_idx = 0;
> +     vq->ubufs = NULL;
>  }
>  
>  static int vhost_worker(void *data)
> @@ -237,6 +240,8 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>                                         GFP_KERNEL);
>               dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
>                                           UIO_MAXIOV, GFP_KERNEL);
> +             dev->vqs[i].ubuf_info = kmalloc(sizeof *dev->vqs[i].ubuf_info *
> +                                         UIO_MAXIOV, GFP_KERNEL);
>  
>               if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
>                       !dev->vqs[i].heads)
> @@ -249,6 +254,7 @@ err_nomem:
>               kfree(dev->vqs[i].indirect);
>               kfree(dev->vqs[i].log);
>               kfree(dev->vqs[i].heads);
> +             kfree(dev->vqs[i].ubuf_info);
>       }
>       return -ENOMEM;
>  }
> @@ -390,6 +396,29 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
>       return 0;
>  }
>  
> +/* In case of DMA done not in order in lower device driver for some reason.
> + * upend_idx is used to track end of used idx, done_idx is used to track head
> + * of used idx. Once lower device DMA done contiguously, we will signal KVM
> + * guest used idx.
> + */
> +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
> +{
> +     int i, j = 0;

On two lines?
        int i;
        int j = 0;
as per CodingStyle..

> +
> +     for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
> +             if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) {
> +                     vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
> +                     vhost_add_used_and_signal(vq->dev, vq,
> +                                               vq->heads[i].id, 0);
> +                     ++j;
> +             } else
> +                     break;
> +     }
> +     if (j)
> +             vq->done_idx = i;
> +     return j;
> +}
> +
>  /* Caller should have device mutex */
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
> @@ -400,6 +429,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>                       vhost_poll_stop(&dev->vqs[i].poll);
>                       vhost_poll_flush(&dev->vqs[i].poll);
>               }
> +             /* Wait for all lower device DMAs done. */
> +             if (dev->vqs[i].ubufs)
> +                     vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
> +
> +             /* Signal guest as appropriate. */
> +             vhost_zerocopy_signal_used(&dev->vqs[i]);
> +
>               if (dev->vqs[i].error_ctx)
>                       eventfd_ctx_put(dev->vqs[i].error_ctx);
>               if (dev->vqs[i].error)
> @@ -1486,3 +1522,52 @@ void vhost_disable_notify(struct vhost_dev *dev, 
> struct vhost_virtqueue *vq)
>                              &vq->used->flags, r);
>       }
>  }
> +
> +static void vhost_zerocopy_done_signal(struct kref *kref)
> +{
> +     struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
> +                                                 kref);
> +     wake_up(&ubufs->wait);
> +}
> +
> +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
> +                                     bool zcopy)
> +{
> +     struct vhost_ubuf_ref *ubufs;
> +     /* No zero copy backend? Nothing to count. */
> +     if (!zcopy)
> +             return NULL;
> +     ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
> +     if (!ubufs)
> +             return ERR_PTR(-ENOMEM);
> +     kref_init(&ubufs->kref);
> +     kref_get(&ubufs->kref);
> +     init_waitqueue_head(&ubufs->wait);
> +     ubufs->vq = vq;
> +     return ubufs;
> +}
> +
> +void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
> +{
> +     kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
> +}
> +
> +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
> +{
> +     kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
> +     wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
> +     kfree(ubufs);
> +}
> +
> +void vhost_zerocopy_callback(void *arg)
> +{
> +     struct ubuf_info *ubuf = (struct ubuf_info *)arg;

No need to explicitly cast a void*, just do:

     struct ubuf_info *ubuf = arg;


> +     struct vhost_ubuf_ref *ubufs;
> +     struct vhost_virtqueue *vq;
> +
> +     ubufs = ubuf->arg;
> +     vq = ubufs->vq;

Why not shorten this a bit as:

     struct vhost_ubuf_ref *ubufs = ubuf->arg;
     struct vhost_virtqueue *vq = ubufs->vq;

?

> +     /* set len = 1 to mark this desc buffers done DMA */
> +     vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> +     kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
> +}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 8e03379..e287145 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -13,6 +13,11 @@
>  #include <linux/virtio_ring.h>
>  #include <asm/atomic.h>
>  
> +/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
> + * done */
> +#define VHOST_DMA_DONE_LEN   1
> +#define VHOST_DMA_CLEAR_LEN  0
> +
>  struct vhost_device;
>  
>  struct vhost_work;
> @@ -50,6 +55,18 @@ struct vhost_log {
>       u64 len;
>  };
>  
> +struct vhost_virtqueue;
> +
> +struct vhost_ubuf_ref {
> +     struct kref kref;
> +     wait_queue_t wait;
> +     struct vhost_virtqueue *vq;
> +};
> +
> +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool 
> zcopy);
> +void vhost_ubuf_put(struct vhost_ubuf_ref *);
> +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>       struct vhost_dev *dev;
> @@ -114,6 +131,16 @@ struct vhost_virtqueue {
>       /* Log write descriptors */
>       void __user *log_base;
>       struct vhost_log *log;
> +     /* vhost zerocopy support fields below: */
> +     /* last used idx for outstanding DMA zerocopy buffers */
> +     int upend_idx;
> +     /* first used idx for DMA done zerocopy buffers */
> +     int done_idx;
> +     /* an array of userspace buffers info */
> +     struct ubuf_info *ubuf_info;
> +     /* Reference counting for outstanding ubufs.
> +      * Protected by vq mutex. Writers must also take device mutex. */
> +     struct vhost_ubuf_ref *ubufs;
>  };
>  
>  struct vhost_dev {
> @@ -160,6 +187,8 @@ bool vhost_enable_notify(struct vhost_dev *, struct 
> vhost_virtqueue *);
>  
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>                   unsigned int log_num, u64 len);
> +void vhost_zerocopy_callback(void *arg);
> +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>               pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> 

-- 
Jesper Juhl <[email protected]>       http://www.chaosbits.net/
Don't top-post http://www.catb.org/jargon/html/T/top-post.html
Plain text mails only, please.

_______________________________________________
Virtualization mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/virtualization

Reply via email to