Bandan Das <b...@redhat.com> writes:

> "Michael S. Tsirkin" <m...@redhat.com> writes:
>
>> On Mon, Jul 13, 2015 at 12:07:32AM -0400, Bandan Das wrote:
>>> vhost threads are per-device, but in most cases a single thread
>>> is enough. This change creates a single thread that is used to
>>> serve all guests.
>>> 
>>> However, this complicates cgroups associations. The current policy
>>> is to attach the per-device thread to all cgroups of the parent process
>>> that the device is associated with. This is no longer possible if we
>>> have a single thread. So, we end up moving the thread around to
>>> cgroups of whichever device that needs servicing. This is a very
>>> inefficient protocol but seems to be the only way to integrate
>>> cgroups support.
>>> 
>>> Signed-off-by: Razya Ladelsky <ra...@il.ibm.com>
>>> Signed-off-by: Bandan Das <b...@redhat.com>
>>
>> BTW, how does this interact with virtio net MQ?
>> It would seem that MQ gains from more parallelism and
>> CPU locality.
>
> Hm.. Good point. As of this version, this design will always have
> one worker thread servicing a guest. Now suppose we have 10 virtio
> queues for a guest, surely, we could benefit from spawning off another
> worker just like we are doing in case of a new guest/device with
> the devs_per_worker parameter.

So, I did a quick smoke test with virtio-net and the Elvis patches.
virtio net MQ already spawns a new worker thread for every queue,
it seems? So, the above setup already works! :) I will run some tests and
post back the results.

>>> ---
>>>  drivers/vhost/scsi.c  |  15 +++--
>>>  drivers/vhost/vhost.c | 150 
>>> ++++++++++++++++++++++++--------------------------
>>>  drivers/vhost/vhost.h |  19 +++++--
>>>  3 files changed, 97 insertions(+), 87 deletions(-)
>>> 
>>> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
>>> index ea32b38..6c42936 100644
>>> --- a/drivers/vhost/scsi.c
>>> +++ b/drivers/vhost/scsi.c
>>> @@ -535,7 +535,7 @@ static void vhost_scsi_complete_cmd(struct 
>>> vhost_scsi_cmd *cmd)
>>>  
>>>     llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list);
>>>  
>>> -   vhost_work_queue(&vs->dev, &vs->vs_completion_work);
>>> +   vhost_work_queue(vs->dev.worker, &vs->vs_completion_work);
>>>  }
>>>  
>>>  static int vhost_scsi_queue_data_in(struct se_cmd *se_cmd)
>>> @@ -1282,7 +1282,7 @@ vhost_scsi_send_evt(struct vhost_scsi *vs,
>>>     }
>>>  
>>>     llist_add(&evt->list, &vs->vs_event_list);
>>> -   vhost_work_queue(&vs->dev, &vs->vs_event_work);
>>> +   vhost_work_queue(vs->dev.worker, &vs->vs_event_work);
>>>  }
>>>  
>>>  static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
>>> @@ -1335,8 +1335,8 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
>>>     /* Flush both the vhost poll and vhost work */
>>>     for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
>>>             vhost_scsi_flush_vq(vs, i);
>>> -   vhost_work_flush(&vs->dev, &vs->vs_completion_work);
>>> -   vhost_work_flush(&vs->dev, &vs->vs_event_work);
>>> +   vhost_work_flush(vs->dev.worker, &vs->vs_completion_work);
>>> +   vhost_work_flush(vs->dev.worker, &vs->vs_event_work);
>>>  
>>>     /* Wait for all reqs issued before the flush to be finished */
>>>     for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
>>> @@ -1584,8 +1584,11 @@ static int vhost_scsi_open(struct inode *inode, 
>>> struct file *f)
>>>     if (!vqs)
>>>             goto err_vqs;
>>>  
>>> -   vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work);
>>> -   vhost_work_init(&vs->vs_event_work, vhost_scsi_evt_work);
>>> +   vhost_work_init(&vs->dev, &vs->vs_completion_work,
>>> +                   vhost_scsi_complete_cmd_work);
>>> +
>>> +   vhost_work_init(&vs->dev, &vs->vs_event_work,
>>> +                   vhost_scsi_evt_work);
>>>  
>>>     vs->vs_events_nr = 0;
>>>     vs->vs_events_missed = false;
>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>> index 2ee2826..951c96b 100644
>>> --- a/drivers/vhost/vhost.c
>>> +++ b/drivers/vhost/vhost.c
>>> @@ -11,6 +11,8 @@
>>>   * Generic code for virtio server in host kernel.
>>>   */
>>>  
>>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>>> +
>>>  #include <linux/eventfd.h>
>>>  #include <linux/vhost.h>
>>>  #include <linux/uio.h>
>>> @@ -28,6 +30,9 @@
>>>  
>>>  #include "vhost.h"
>>>  
>>> +/* Just one worker thread to service all devices */
>>> +static struct vhost_worker *worker;
>>> +
>>>  enum {
>>>     VHOST_MEMORY_MAX_NREGIONS = 64,
>>>     VHOST_MEMORY_F_LOG = 0x1,
>>> @@ -58,13 +63,15 @@ static int vhost_poll_wakeup(wait_queue_t *wait, 
>>> unsigned mode, int sync,
>>>     return 0;
>>>  }
>>>  
>>> -void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
>>> +void vhost_work_init(struct vhost_dev *dev,
>>> +                struct vhost_work *work, vhost_work_fn_t fn)
>>>  {
>>>     INIT_LIST_HEAD(&work->node);
>>>     work->fn = fn;
>>>     init_waitqueue_head(&work->done);
>>>     work->flushing = 0;
>>>     work->queue_seq = work->done_seq = 0;
>>> +   work->dev = dev;
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_work_init);
>>>  
>>> @@ -78,7 +85,7 @@ void vhost_poll_init(struct vhost_poll *poll, 
>>> vhost_work_fn_t fn,
>>>     poll->dev = dev;
>>>     poll->wqh = NULL;
>>>  
>>> -   vhost_work_init(&poll->work, fn);
>>> +   vhost_work_init(dev, &poll->work, fn);
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_poll_init);
>>>  
>>> @@ -116,30 +123,30 @@ void vhost_poll_stop(struct vhost_poll *poll)
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_poll_stop);
>>>  
>>> -static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work 
>>> *work,
>>> -                           unsigned seq)
>>> +static bool vhost_work_seq_done(struct vhost_worker *worker,
>>> +                           struct vhost_work *work, unsigned seq)
>>>  {
>>>     int left;
>>>  
>>> -   spin_lock_irq(&dev->work_lock);
>>> +   spin_lock_irq(&worker->work_lock);
>>>     left = seq - work->done_seq;
>>> -   spin_unlock_irq(&dev->work_lock);
>>> +   spin_unlock_irq(&worker->work_lock);
>>>     return left <= 0;
>>>  }
>>>  
>>> -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
>>> +void vhost_work_flush(struct vhost_worker *worker, struct vhost_work *work)
>>>  {
>>>     unsigned seq;
>>>     int flushing;
>>>  
>>> -   spin_lock_irq(&dev->work_lock);
>>> +   spin_lock_irq(&worker->work_lock);
>>>     seq = work->queue_seq;
>>>     work->flushing++;
>>> -   spin_unlock_irq(&dev->work_lock);
>>> -   wait_event(work->done, vhost_work_seq_done(dev, work, seq));
>>> -   spin_lock_irq(&dev->work_lock);
>>> +   spin_unlock_irq(&worker->work_lock);
>>> +   wait_event(work->done, vhost_work_seq_done(worker, work, seq));
>>> +   spin_lock_irq(&worker->work_lock);
>>>     flushing = --work->flushing;
>>> -   spin_unlock_irq(&dev->work_lock);
>>> +   spin_unlock_irq(&worker->work_lock);
>>>     BUG_ON(flushing < 0);
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_work_flush);
>>> @@ -148,29 +155,30 @@ EXPORT_SYMBOL_GPL(vhost_work_flush);
>>>   * locks that are also used by the callback. */
>>>  void vhost_poll_flush(struct vhost_poll *poll)
>>>  {
>>> -   vhost_work_flush(poll->dev, &poll->work);
>>> +   vhost_work_flush(poll->dev->worker, &poll->work);
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_poll_flush);
>>>  
>>> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
>>> +void vhost_work_queue(struct vhost_worker *worker,
>>> +                 struct vhost_work *work)
>>>  {
>>>     unsigned long flags;
>>>  
>>> -   spin_lock_irqsave(&dev->work_lock, flags);
>>> +   spin_lock_irqsave(&worker->work_lock, flags);
>>>     if (list_empty(&work->node)) {
>>> -           list_add_tail(&work->node, &dev->work_list);
>>> +           list_add_tail(&work->node, &worker->work_list);
>>>             work->queue_seq++;
>>> -           spin_unlock_irqrestore(&dev->work_lock, flags);
>>> -           wake_up_process(dev->worker);
>>> +           spin_unlock_irqrestore(&worker->work_lock, flags);
>>> +           wake_up_process(worker->thread);
>>>     } else {
>>> -           spin_unlock_irqrestore(&dev->work_lock, flags);
>>> +           spin_unlock_irqrestore(&worker->work_lock, flags);
>>>     }
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_work_queue);
>>>  
>>>  void vhost_poll_queue(struct vhost_poll *poll)
>>>  {
>>> -   vhost_work_queue(poll->dev, &poll->work);
>>> +   vhost_work_queue(poll->dev->worker, &poll->work);
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_poll_queue);
>>>  
>>> @@ -203,19 +211,18 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>>>  
>>>  static int vhost_worker(void *data)
>>>  {
>>> -   struct vhost_dev *dev = data;
>>> +   struct vhost_worker *worker = data;
>>>     struct vhost_work *work = NULL;
>>>     unsigned uninitialized_var(seq);
>>>     mm_segment_t oldfs = get_fs();
>>>  
>>>     set_fs(USER_DS);
>>> -   use_mm(dev->mm);
>>>  
>>>     for (;;) {
>>>             /* mb paired w/ kthread_stop */
>>>             set_current_state(TASK_INTERRUPTIBLE);
>>>  
>>> -           spin_lock_irq(&dev->work_lock);
>>> +           spin_lock_irq(&worker->work_lock);
>>>             if (work) {
>>>                     work->done_seq = seq;
>>>                     if (work->flushing)
>>> @@ -223,21 +230,35 @@ static int vhost_worker(void *data)
>>>             }
>>>  
>>>             if (kthread_should_stop()) {
>>> -                   spin_unlock_irq(&dev->work_lock);
>>> +                   spin_unlock_irq(&worker->work_lock);
>>>                     __set_current_state(TASK_RUNNING);
>>>                     break;
>>>             }
>>> -           if (!list_empty(&dev->work_list)) {
>>> -                   work = list_first_entry(&dev->work_list,
>>> +           if (!list_empty(&worker->work_list)) {
>>> +                   work = list_first_entry(&worker->work_list,
>>>                                             struct vhost_work, node);
>>>                     list_del_init(&work->node);
>>>                     seq = work->queue_seq;
>>>             } else
>>>                     work = NULL;
>>> -           spin_unlock_irq(&dev->work_lock);
>>> +           spin_unlock_irq(&worker->work_lock);
>>>  
>>>             if (work) {
>>> +                   struct vhost_dev *dev = work->dev;
>>> +
>>>                     __set_current_state(TASK_RUNNING);
>>> +
>>> +                   if (current->mm != dev->mm) {
>>> +                           unuse_mm(current->mm);
>>> +                           use_mm(dev->mm);
>>> +                   }
>>> +
>>> +                   /* TODO: Consider a more elegant solution */
>>> +                   if (worker->owner != dev->owner) {
>>> +                           /* Should check for return value */
>>> +                           cgroup_attach_task_all(dev->owner, current);
>>> +                           worker->owner = dev->owner;
>>> +                   }
>>>                     work->fn(work);
>>>                     if (need_resched())
>>>                             schedule();
>>> @@ -245,7 +266,6 @@ static int vhost_worker(void *data)
>>>                     schedule();
>>>  
>>>     }
>>> -   unuse_mm(dev->mm);
>>>     set_fs(oldfs);
>>>     return 0;
>>>  }
>>> @@ -304,9 +324,8 @@ void vhost_dev_init(struct vhost_dev *dev,
>>>     dev->log_file = NULL;
>>>     dev->memory = NULL;
>>>     dev->mm = NULL;
>>> -   spin_lock_init(&dev->work_lock);
>>> -   INIT_LIST_HEAD(&dev->work_list);
>>> -   dev->worker = NULL;
>>> +   dev->worker = worker;
>>> +   dev->owner = current;
>>>  
>>>     for (i = 0; i < dev->nvqs; ++i) {
>>>             vq = dev->vqs[i];
>>> @@ -331,31 +350,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
>>>  }
>>>  EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
>>>  
>>> -struct vhost_attach_cgroups_struct {
>>> -   struct vhost_work work;
>>> -   struct task_struct *owner;
>>> -   int ret;
>>> -};
>>> -
>>> -static void vhost_attach_cgroups_work(struct vhost_work *work)
>>> -{
>>> -   struct vhost_attach_cgroups_struct *s;
>>> -
>>> -   s = container_of(work, struct vhost_attach_cgroups_struct, work);
>>> -   s->ret = cgroup_attach_task_all(s->owner, current);
>>> -}
>>> -
>>> -static int vhost_attach_cgroups(struct vhost_dev *dev)
>>> -{
>>> -   struct vhost_attach_cgroups_struct attach;
>>> -
>>> -   attach.owner = current;
>>> -   vhost_work_init(&attach.work, vhost_attach_cgroups_work);
>>> -   vhost_work_queue(dev, &attach.work);
>>> -   vhost_work_flush(dev, &attach.work);
>>> -   return attach.ret;
>>> -}
>>> -
>>>  /* Caller should have device mutex */
>>>  bool vhost_dev_has_owner(struct vhost_dev *dev)
>>>  {
>>> @@ -366,7 +360,6 @@ EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
>>>  /* Caller should have device mutex */
>>>  long vhost_dev_set_owner(struct vhost_dev *dev)
>>>  {
>>> -   struct task_struct *worker;
>>>     int err;
>>>  
>>>     /* Is there an owner already? */
>>> @@ -377,28 +370,15 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>>>  
>>>     /* No owner, become one */
>>>     dev->mm = get_task_mm(current);
>>> -   worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
>>> -   if (IS_ERR(worker)) {
>>> -           err = PTR_ERR(worker);
>>> -           goto err_worker;
>>> -   }
>>> -
>>>     dev->worker = worker;
>>> -   wake_up_process(worker);        /* avoid contributing to loadavg */
>>> -
>>> -   err = vhost_attach_cgroups(dev);
>>> -   if (err)
>>> -           goto err_cgroup;
>>>  
>>>     err = vhost_dev_alloc_iovecs(dev);
>>>     if (err)
>>> -           goto err_cgroup;
>>> +           goto err_alloc;
>>>  
>>>     return 0;
>>> -err_cgroup:
>>> -   kthread_stop(worker);
>>> +err_alloc:
>>>     dev->worker = NULL;
>>> -err_worker:
>>>     if (dev->mm)
>>>             mmput(dev->mm);
>>>     dev->mm = NULL;
>>> @@ -472,11 +452,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool 
>>> locked)
>>>     /* No one will access memory at this point */
>>>     kfree(dev->memory);
>>>     dev->memory = NULL;
>>> -   WARN_ON(!list_empty(&dev->work_list));
>>> -   if (dev->worker) {
>>> -           kthread_stop(dev->worker);
>>> -           dev->worker = NULL;
>>> -   }
>>>     if (dev->mm)
>>>             mmput(dev->mm);
>>>     dev->mm = NULL;
>>> @@ -1567,11 +1542,32 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify);
>>>  
>>>  static int __init vhost_init(void)
>>>  {
>>> +   struct vhost_worker *w =
>>> +           kzalloc(sizeof(*w), GFP_KERNEL);
>>> +   if (!w)
>>> +           return -ENOMEM;
>>> +
>>> +   w->thread = kthread_create(vhost_worker,
>>> +                              w, "vhost-worker");
>>> +   if (IS_ERR(w->thread))
>>> +           return PTR_ERR(w->thread);
>>> +
>>> +   worker = w;
>>> +   spin_lock_init(&worker->work_lock);
>>> +   INIT_LIST_HEAD(&worker->work_list);
>>> +   wake_up_process(worker->thread);
>>> +   pr_info("Created universal thread to service requests\n");
>>> +
>>>     return 0;
>>>  }
>>>  
>>>  static void __exit vhost_exit(void)
>>>  {
>>> +   if (worker) {
>>> +           kthread_stop(worker->thread);
>>> +           WARN_ON(!list_empty(&worker->work_list));
>>> +           kfree(worker);
>>> +   }
>>>  }
>>>  
>>>  module_init(vhost_init);
>>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>>> index 8c1c792..2f204ce 100644
>>> --- a/drivers/vhost/vhost.h
>>> +++ b/drivers/vhost/vhost.h
>>> @@ -22,6 +22,7 @@ struct vhost_work {
>>>     int                       flushing;
>>>     unsigned                  queue_seq;
>>>     unsigned                  done_seq;
>>> +   struct vhost_dev          *dev;
>>>  };
>>>  
>>>  /* Poll a file (eventfd or socket) */
>>> @@ -35,8 +36,8 @@ struct vhost_poll {
>>>     struct vhost_dev         *dev;
>>>  };
>>>  
>>> -void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
>>> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
>>> +void vhost_work_init(struct vhost_dev *dev,
>>> +                struct vhost_work *work, vhost_work_fn_t fn);
>>>  
>>>  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
>>>                  unsigned long mask, struct vhost_dev *dev);
>>> @@ -44,7 +45,6 @@ int vhost_poll_start(struct vhost_poll *poll, struct file 
>>> *file);
>>>  void vhost_poll_stop(struct vhost_poll *poll);
>>>  void vhost_poll_flush(struct vhost_poll *poll);
>>>  void vhost_poll_queue(struct vhost_poll *poll);
>>> -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work);
>>>  long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
>>>  
>>>  struct vhost_log {
>>> @@ -116,11 +116,22 @@ struct vhost_dev {
>>>     int nvqs;
>>>     struct file *log_file;
>>>     struct eventfd_ctx *log_ctx;
>>> +   /* vhost shared worker */
>>> +   struct vhost_worker *worker;
>>> +   /* for cgroup support */
>>> +   struct task_struct *owner;
>>> +};
>>> +
>>> +struct vhost_worker {
>>>     spinlock_t work_lock;
>>>     struct list_head work_list;
>>> -   struct task_struct *worker;
>>> +   struct task_struct *thread;
>>> +   struct task_struct *owner;
>>>  };
>>>  
>>> +void vhost_work_queue(struct vhost_worker *worker,
>>> +                 struct vhost_work *work);
>>> +void vhost_work_flush(struct vhost_worker *worker, struct vhost_work 
>>> *work);
>>>  void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int 
>>> nvqs);
>>>  long vhost_dev_set_owner(struct vhost_dev *dev);
>>>  bool vhost_dev_has_owner(struct vhost_dev *dev);
>>> -- 
>>> 2.4.3
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to