[CC-ing linux-block and linux-scsi and adding some comments]

On Mon, Feb 01, 2016 at 11:43:40PM +0100, Andreas Herrmann wrote:
> This introduces a new blk_mq hw attribute time_slice_us which allows
> to specify a time slice in usecs.
> 
> Default value is 0 and implies no modification to blk-mq behaviour.
> 
> A positive value changes blk-mq to service only one software queue
> within this time slice until it expires or the software queue is
> empty. Then the next software queue with pending requests is selected.
> 
> Signed-off-by: Andreas Herrmann <aherrm...@suse.com>
> ---
>  block/blk-mq-sysfs.c   |  27 +++++++
>  block/blk-mq.c         | 208 
> +++++++++++++++++++++++++++++++++++++++++--------
>  include/linux/blk-mq.h |   9 +++
>  3 files changed, 211 insertions(+), 33 deletions(-)
> 
> Hi,
> 
> This update is long overdue (sorry for the delay).
> 
> Change to v1:
> - time slice is now specified in usecs instead of msecs.
> - time slice is extended (up to 3 times the initial value) when there
>   was actually a request to be serviced for the software queue
> 
> Fio test results are sent in a separate mail to this.

See http://marc.info/?l=linux-kernel&m=145436682607949&w=2

In short it shows significant performance gains in some tests,
e.g. sequential read IOPS up by >40% with 8 jobs. But it is never on
par with CFQ when more than 1 job is used during the test.

> Results for fio improved to some extent with this patch. But in
> reality the picture is quite mixed. Performance is highly dependent on
> task scheduling. There is no guarantee that the requests originated
> from one CPU belong to the same process.
> 
> I think for rotary devices CFQ is by far the best choice. A simple
> illustration is:
> 
>   Copying two files (750MB in this case) in parallel on a rotary
>   device. The elapsed wall clock time (seconds) for this is
>                                mean    stdev
>    cfq, slice_idle=8           16.18   4.95
>    cfq, slice_idle=0           23.74   2.82
>    blk-mq, time_slice_usec=0   24.37   2.05
>    blk-mq, time_slice_usec=250 25.58   3.16

This illustrates that although there was a performance gain in the fio
tests, the patch can cause higher variance and lower performance in
comparison to unmodified blk-mq with other tests. And it underscores
the superiority of CFQ for rotary disks.

Meanwhile my opinion is that it's not really worth looking further
into introducing I/O scheduling support in blk-mq. I don't see the
need for scheduling support (deadline or something else) for fast
storage devices. And rotary devices should really avoid using blk-mq
and stick to CFQ.

Thus I think that introducing some coexistence of blk-mq and the
legacy block layer with CFQ is the best option.

Recently Johannes sent a patch to enable scsi-mq per driver, see
http://marc.info/?l=linux-scsi&m=145347009631192&w=2

Probably that is a good solution (at least in the short term) to allow
users to switch to blk-mq for some host adapters (with fast storage
attached) but to stick to legacy stuff on other host adapters with
rotary devices.

What do others think?


Thanks,

Andreas


> Regards,
> 
> Andreas
> 
> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> index 1cf1878..77c875c 100644
> --- a/block/blk-mq-sysfs.c
> +++ b/block/blk-mq-sysfs.c
> @@ -247,6 +247,26 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct 
> blk_mq_hw_ctx *hctx, char *page)
>       return ret;
>  }
>  
> +static ssize_t blk_mq_hw_sysfs_tslice_show(struct blk_mq_hw_ctx *hctx,
> +                                       char *page)
> +{
> +     return sprintf(page, "%u\n", hctx->tslice_us);
> +}
> +
> +static ssize_t blk_mq_hw_sysfs_tslice_store(struct blk_mq_hw_ctx *hctx,
> +                                     const char *page, size_t length)
> +{
> +     unsigned long long store;
> +     int err;
> +
> +     err = kstrtoull(page, 10, &store);
> +     if (err)
> +             return -EINVAL;
> +
> +     hctx->tslice_us = (unsigned)store;
> +     return length;
> +}
> +
>  static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
>       .attr = {.name = "dispatched", .mode = S_IRUGO },
>       .show = blk_mq_sysfs_dispatched_show,
> @@ -305,6 +325,12 @@ static struct blk_mq_hw_ctx_sysfs_entry 
> blk_mq_hw_sysfs_poll = {
>       .show = blk_mq_hw_sysfs_poll_show,
>  };
>  
> +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tslice = {
> +     .attr = {.name = "time_slice_us", .mode = S_IRUGO | S_IWUSR },
> +     .show = blk_mq_hw_sysfs_tslice_show,
> +     .store = blk_mq_hw_sysfs_tslice_store,
> +};
> +
>  static struct attribute *default_hw_ctx_attrs[] = {
>       &blk_mq_hw_sysfs_queued.attr,
>       &blk_mq_hw_sysfs_run.attr,
> @@ -314,6 +340,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
>       &blk_mq_hw_sysfs_cpus.attr,
>       &blk_mq_hw_sysfs_active.attr,
>       &blk_mq_hw_sysfs_poll.attr,
> +     &blk_mq_hw_sysfs_tslice.attr,
>       NULL,
>  };
>  
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4c0622f..97d32f2 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -68,6 +68,7 @@ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx 
> *hctx,
>  
>       if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
>               set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
> +     cpumask_set_cpu(ctx->cpu, hctx->cpu_pending_mask);
>  }
>  
>  static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
> @@ -76,6 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx 
> *hctx,
>       struct blk_align_bitmap *bm = get_bm(hctx, ctx);
>  
>       clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
> +     cpumask_clear_cpu(ctx->cpu, hctx->cpu_pending_mask);
>  }
>  
>  void blk_mq_freeze_queue_start(struct request_queue *q)
> @@ -682,15 +684,41 @@ static bool blk_mq_attempt_merge(struct request_queue 
> *q,
>       return false;
>  }
>  
> +static int tslice_flush_one_ctx(struct blk_mq_hw_ctx *hctx,
> +                             struct list_head *list, int cpu)
> +{
> +     struct request_queue *q = hctx->queue;
> +     struct blk_mq_ctx *ctx;
> +
> +     if (cpu == -1 || !hctx->tslice_us)
> +             return 0;
> +
> +     ctx = __blk_mq_get_ctx(q, cpu);
> +     spin_lock(&ctx->lock);
> +     if (!list_empty(&ctx->rq_list)) {
> +             list_splice_tail_init(&ctx->rq_list, list);
> +             spin_lock(&hctx->lock);
> +             hctx->tslice_inc = 1;
> +             blk_mq_hctx_clear_pending(hctx, ctx);
> +             spin_unlock(&hctx->lock);
> +     }
> +     spin_unlock(&ctx->lock);
> +     return 1;
> +}
> +
>  /*
>   * Process software queues that have been marked busy, splicing them
>   * to the for-dispatch
>   */
> -static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head 
> *list)
> +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx,
> +                     struct list_head *list, int cpu)
>  {
>       struct blk_mq_ctx *ctx;
>       int i;
>  
> +     if (tslice_flush_one_ctx(hctx, list, cpu))
> +             return;
> +
>       for (i = 0; i < hctx->ctx_map.size; i++) {
>               struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
>               unsigned int off, bit;
> @@ -706,9 +734,11 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, 
> struct list_head *list)
>                               break;
>  
>                       ctx = hctx->ctxs[bit + off];
> -                     clear_bit(bit, &bm->word);
>                       spin_lock(&ctx->lock);
>                       list_splice_tail_init(&ctx->rq_list, list);
> +                     spin_lock(&hctx->lock);
> +                     blk_mq_hctx_clear_pending(hctx, ctx);
> +                     spin_unlock(&hctx->lock);
>                       spin_unlock(&ctx->lock);
>  
>                       bit++;
> @@ -717,6 +747,114 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, 
> struct list_head *list)
>  }
>  
>  /*
> + * It'd be great if the workqueue API had a way to pass
> + * in a mask and had some smarts for more clever placement.
> + * For now we just round-robin here, switching for every
> + * BLK_MQ_CPU_WORK_BATCH queued items.
> + */
> +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
> +{
> +     if (hctx->queue->nr_hw_queues == 1)
> +             return WORK_CPU_UNBOUND;
> +
> +     if (--hctx->next_cpu_batch <= 0) {
> +             int cpu = hctx->next_cpu, next_cpu;
> +
> +             next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
> +             if (next_cpu >= nr_cpu_ids)
> +                     next_cpu = cpumask_first(hctx->cpumask);
> +
> +             hctx->next_cpu = next_cpu;
> +             hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
> +
> +             return cpu;
> +     }
> +
> +     return hctx->next_cpu;
> +}
> +
> +static int blk_mq_tslice_expired(struct blk_mq_hw_ctx *hctx)
> +{
> +     if (time_after_eq(jiffies, hctx->tslice_expiration))
> +             return 1;
> +
> +     return 0;
> +}
> +
> +static int blk_mq_tslice_next_cpu(struct blk_mq_hw_ctx *hctx)
> +{
> +     int c = hctx->tslice_cpu;
> +
> +     /* allow extension of time slice */
> +     if (hctx->tslice_inc && hctx->tslice_inc_count < 4) {
> +             hctx->tslice_inc = 0;
> +             hctx->tslice_inc_count++;
> +             goto out;
> +     }
> +     hctx->tslice_inc = 0;
> +     hctx->tslice_inc_count = 0;
> +
> +     /* clear CPU for which tslice has expired */
> +     if (c != -1)
> +             cpumask_clear_cpu(c, hctx->cpu_service_mask);
> +
> +     /* calculate mask of remaining CPUs with pending work */
> +     if (cpumask_and(hctx->cpu_next_mask, hctx->cpu_pending_mask,
> +                             hctx->cpu_service_mask)) {
> +             c = cpumask_first(hctx->cpu_next_mask);
> +     } else {
> +             /*
> +              * no remaining CPUs with pending work, reset epoch,
> +              * start with first CPU that has requests pending
> +              */
> +             hctx->tslice_expiration = 0;
> +             cpumask_setall(hctx->cpu_service_mask);
> +             c = cpumask_first(hctx->cpu_pending_mask);
> +     }
> +
> +     /* no CPU with pending requests */
> +     if (c >= nr_cpu_ids)
> +             return -1;
> +
> +out:
> +     hctx->tslice_expiration = jiffies + usecs_to_jiffies(hctx->tslice_us);
> +     return c;
> +}
> +
> +static int tslice_get_busy_ctx(struct blk_mq_hw_ctx *hctx)
> +{
> +     int cpu = -1;
> +
> +     if (hctx->tslice_us) {
> +             spin_lock(&hctx->lock);
> +             if (blk_mq_tslice_expired(hctx))
> +                     hctx->tslice_cpu = blk_mq_tslice_next_cpu(hctx);
> +             cpu = hctx->tslice_cpu;
> +             spin_unlock(&hctx->lock);
> +     }
> +
> +     return cpu;
> +}
> +
> +/*
> + * Ensure that hardware queue is run if there are pending
> + * requests in any software queue.
> + */
> +static void tslice_schedule_pending_work(struct blk_mq_hw_ctx *hctx)
> +{
> +     long t;
> +
> +     if (!hctx->tslice_us || cpumask_empty(hctx->cpu_pending_mask))
> +             return;
> +
> +     t = hctx->tslice_expiration - jiffies;
> +     if (t < 0)
> +             t = 0;
> +     kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> +                                     &hctx->run_work, (unsigned int)t);
> +}
> +
> +/*
>   * Run this hardware queue, pulling any software queues mapped to it in.
>   * Note that this function currently has various problems around ordering
>   * of IO. In particular, we'd like FIFO behaviour on handling existing
> @@ -729,7 +867,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx 
> *hctx)
>       LIST_HEAD(rq_list);
>       LIST_HEAD(driver_list);
>       struct list_head *dptr;
> -     int queued;
> +     int queued, cpu;
>  
>       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
>  
> @@ -739,9 +877,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx 
> *hctx)
>       hctx->run++;
>  
>       /*
> -      * Touch any software queue that has pending entries.
> +      * Touch dedicated software queue if time slice is set or any
> +      * software queue that has pending entries (cpu == -1).
>        */
> -     flush_busy_ctxs(hctx, &rq_list);
> +     cpu = tslice_get_busy_ctx(hctx);
> +     flush_busy_ctxs(hctx, &rq_list, cpu);
>  
>       /*
>        * If we have previous entries on our dispatch list, grab them
> @@ -826,34 +966,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx 
> *hctx)
>                * blk_mq_run_hw_queue() already checks the STOPPED bit
>                **/
>               blk_mq_run_hw_queue(hctx, true);
> -     }
> -}
> -
> -/*
> - * It'd be great if the workqueue API had a way to pass
> - * in a mask and had some smarts for more clever placement.
> - * For now we just round-robin here, switching for every
> - * BLK_MQ_CPU_WORK_BATCH queued items.
> - */
> -static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
> -{
> -     if (hctx->queue->nr_hw_queues == 1)
> -             return WORK_CPU_UNBOUND;
> -
> -     if (--hctx->next_cpu_batch <= 0) {
> -             int cpu = hctx->next_cpu, next_cpu;
> -
> -             next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
> -             if (next_cpu >= nr_cpu_ids)
> -                     next_cpu = cpumask_first(hctx->cpumask);
> -
> -             hctx->next_cpu = next_cpu;
> -             hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
> -
> -             return cpu;
> -     }
> -
> -     return hctx->next_cpu;
> +     } else
> +             tslice_schedule_pending_work(hctx);
>  }
>  
>  void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> @@ -992,7 +1106,10 @@ static void __blk_mq_insert_request(struct 
> blk_mq_hw_ctx *hctx,
>       struct blk_mq_ctx *ctx = rq->mq_ctx;
>  
>       __blk_mq_insert_req_list(hctx, ctx, rq, at_head);
> +     spin_lock(&hctx->lock);
>       blk_mq_hctx_mark_pending(hctx, ctx);
> +     spin_unlock(&hctx->lock);
> +
>  }
>  
>  void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
> @@ -1583,7 +1700,9 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx 
> *hctx, int cpu)
>       spin_lock(&ctx->lock);
>       if (!list_empty(&ctx->rq_list)) {
>               list_splice_init(&ctx->rq_list, &tmp);
> +             spin_lock(&hctx->lock);
>               blk_mq_hctx_clear_pending(hctx, ctx);
> +             spin_unlock(&hctx->lock);
>       }
>       spin_unlock(&ctx->lock);
>  
> @@ -1602,7 +1721,9 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx 
> *hctx, int cpu)
>       }
>  
>       hctx = q->mq_ops->map_queue(q, ctx->cpu);
> +     spin_lock(&hctx->lock);
>       blk_mq_hctx_mark_pending(hctx, ctx);
> +     spin_unlock(&hctx->lock);
>  
>       spin_unlock(&ctx->lock);
>  
> @@ -1691,6 +1812,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
>       hctx->queue_num = hctx_idx;
>       hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
>  
> +     hctx->tslice_us = 0;
> +     hctx->tslice_expiration = 0;
> +     hctx->tslice_cpu = -1;
> +     hctx->tslice_inc = 0;
> +     hctx->tslice_inc_count = 0;
> +
>       blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
>                                       blk_mq_hctx_notify, hctx);
>       blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
> @@ -2006,6 +2133,18 @@ struct request_queue 
> *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>                                               node))
>                       goto err_hctxs;
>  
> +             if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_pending_mask,
> +                                             GFP_KERNEL, node))
> +                     goto err_hctxs;
> +
> +             if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_service_mask,
> +                                             GFP_KERNEL, node))
> +                     goto err_hctxs;
> +
> +             if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_next_mask,
> +                                             GFP_KERNEL, node))
> +                     goto err_hctxs;
> +
>               atomic_set(&hctxs[i]->nr_active, 0);
>               hctxs[i]->numa_node = node;
>               hctxs[i]->queue_num = i;
> @@ -2069,6 +2208,9 @@ err_hctxs:
>               if (!hctxs[i])
>                       break;
>               free_cpumask_var(hctxs[i]->cpumask);
> +             free_cpumask_var(hctxs[i]->cpu_pending_mask);
> +             free_cpumask_var(hctxs[i]->cpu_service_mask);
> +             free_cpumask_var(hctxs[i]->cpu_next_mask);
>               kfree(hctxs[i]);
>       }
>  err_map:
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 7fc9296..a8ca685 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -57,6 +57,15 @@ struct blk_mq_hw_ctx {
>  
>       atomic_t                nr_active;
>  
> +     cpumask_var_t           cpu_service_mask; /* CPUs not yet serviced */
> +     cpumask_var_t           cpu_pending_mask; /* CPUs with pending work */
> +     cpumask_var_t           cpu_next_mask;  /* CPUs to be serviced */
> +     int                     tslice_us;      /* time slice in µs */
> +     int                     tslice_inc;     /* extend time slice */
> +     int                     tslice_inc_count;
> +     unsigned long           tslice_expiration; /* in jiffies */
> +     int                     tslice_cpu;     /* cpu of time slice */
> +
>       struct blk_mq_cpu_notifier      cpu_notifier;
>       struct kobject          kobj;
>  
> -- 
> 1.9.1
> 

Reply via email to