On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote:
> On 09/02/2017 09:17 AM, Ming Lei wrote:
> > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct 
> > blk_mq_hw_ctx *hctx)
> >     if (!list_empty(&rq_list)) {
> >             blk_mq_sched_mark_restart_hctx(hctx);
> >             do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
> > -   } else if (!has_sched_dispatch) {
> > +   } else if (!has_sched_dispatch && !q->queue_depth) {
> > +           /*
> > +            * If there is no per-request_queue depth, we
> > +            * flush all requests in this hw queue, otherwise
> > +            * pick up request one by one from sw queue for
> > +            * avoiding to mess up I/O merge when dispatch
> > +            * is busy, which can be triggered easily by
> > +            * per-request_queue queue depth
> > +            */
> >             blk_mq_flush_busy_ctxs(hctx, &rq_list);
> >             blk_mq_dispatch_rq_list(q, &rq_list);
> >     }
> 
> I don't like this part at all. It's a heuristic, and on top of that,
> it's a heuristic based on a weak assumption that if q->queue_depth == 0,
> then we never run into BUSY constraints. I think that would be stronger
> if it was based on "is this using shared tags", but even that is not
> great at all. It's perfectly possible to have a shared tags setup and
> never run into resource constraints. The opposite condition is also true
> - you can run without shared tags, and yet hit resource constraints
> constantly. Hence this is NOT a good test for deciding whether to flush
> everything at once or not. In short, I think a much better test case
> would be "has this device ever returned BLK_STS_RESOURCE. As it so
> happens, that's easy enough to keep track of and base this test on.

Hi Jens,

The attached patch follows your suggestion: it uses an EWMA to
compute the average length of hctx->dispatch, and only flushes
all requests from the ctxs when that average length is zero.
What do you think about this approach? Or do you have other suggestions?


diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7a27f262c96a..c420c775b9c0 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -232,6 +232,14 @@ static int hctx_flags_show(void *data, struct seq_file *m)
        return 0;
 }
 
+static int hctx_dispatch_len_show(void *data, struct seq_file *m)
+{
+       struct blk_mq_hw_ctx *hctx = data;
+
+       seq_printf(m, "%u\n", hctx->dispatch_len);
+       return 0;
+}
+
 #define REQ_OP_NAME(name) [REQ_OP_##name] = #name
 static const char *const op_name[] = {
        REQ_OP_NAME(READ),
@@ -763,6 +771,7 @@ static const struct blk_mq_debugfs_attr 
blk_mq_debugfs_hctx_attrs[] = {
        {"state", 0400, hctx_state_show},
        {"flags", 0400, hctx_flags_show},
        {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops},
+       {"dispatch_length", 0400, hctx_dispatch_len_show},
        {"busy", 0400, hctx_busy_show},
        {"ctx_map", 0400, hctx_ctx_map_show},
        {"tags", 0400, hctx_tags_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a5712dd67783..91a6eeaadaf1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -102,7 +102,7 @@ static void blk_mq_do_dispatch_sched(struct request_queue 
*q,
                if (!rq)
                        break;
                list_add(&rq->queuelist, &rq_list);
-       } while (blk_mq_dispatch_rq_list(q, &rq_list));
+       } while (blk_mq_dispatch_rq_list(q, &rq_list, 1));
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -134,19 +134,51 @@ static void blk_mq_do_dispatch_ctx(struct request_queue 
*q,
                /* round robin for fair dispatch */
                ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 
-               dispatched = blk_mq_dispatch_rq_list(q, &rq_list);
+               dispatched = blk_mq_dispatch_rq_list(q, &rq_list, 1);
        } while (dispatched);
 
        if (!dispatched)
                WRITE_ONCE(hctx->dispatch_from, ctx);
 }
 
+/* borrowed from bcache */
+void ewma_add(unsigned *ewma, unsigned val, unsigned weight)
+{
+       unsigned  result = READ_ONCE(*ewma);
+
+       if (val == 1) {
+               result += 1;
+       } else {
+               result *= weight - 1;
+               result += val;
+               result /= weight;
+       }
+       WRITE_ONCE(*ewma, result);
+}
+
+/*
+ * We use EWMA to compute the average length of the dispatch list.
+ * It is totally lockless, but we can survive that since it
+ * is just a hint.
+ *
+ * We only flush requests from all ctxs if the average length
+ * of the dispatch list is zero; otherwise the hw queue can be
+ * thought of as busy and we dequeue requests from the ctxs one
+ * by one
+ */
+static void blk_mq_update_dispatch_length(struct blk_mq_hw_ctx *hctx,
+                                         unsigned cnt)
+{
+       ewma_add(&hctx->dispatch_len, cnt, 8);
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
        LIST_HEAD(rq_list);
+       unsigned rq_cnt = 0;
 
        /* RCU or SRCU read lock is needed before checking quiesced flag */
        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -162,9 +194,14 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
                spin_lock(&hctx->lock);
                if (!list_empty(&hctx->dispatch))
                        list_splice_init(&hctx->dispatch, &rq_list);
+               rq_cnt = hctx->cnt;
+               hctx->cnt = 0;
                spin_unlock(&hctx->lock);
        }
 
+       if (!has_sched_dispatch)
+               blk_mq_update_dispatch_length(hctx, rq_cnt);
+
        /*
         * Only ask the scheduler for requests, if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
@@ -176,7 +213,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
-               blk_mq_dispatch_rq_list(q, &rq_list);
+               blk_mq_dispatch_rq_list(q, &rq_list, rq_cnt);
 
                /*
                 * We may clear DISPATCH_BUSY just after it
@@ -204,16 +241,16 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 
        if (!has_sched_dispatch) {
                /*
-                * If there is no per-request_queue depth, we
+                * If dispatch doesn't run out of resource, we
                 * flush all requests in this hw queue, otherwise
                 * pick up request one by one from sw queue for
                 * avoiding to mess up I/O merge when dispatch
                 * run out of resource, which can be triggered
                 * easily by per-request_queue queue depth
                 */
-               if (!q->queue_depth) {
-                       blk_mq_flush_busy_ctxs(hctx, &rq_list);
-                       blk_mq_dispatch_rq_list(q, &rq_list);
+               if (!hctx->dispatch_len) {
+                       rq_cnt = blk_mq_flush_busy_ctxs(hctx, &rq_list);
+                       blk_mq_dispatch_rq_list(q, &rq_list, rq_cnt);
                } else {
                        blk_mq_do_dispatch_ctx(q, hctx);
                }
@@ -346,6 +383,7 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx 
*hctx,
         * the dispatch list.
         */
        spin_lock(&hctx->lock);
+       hctx->cnt++;
        list_add(&rq->queuelist, &hctx->dispatch);
        set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
        spin_unlock(&hctx->lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1bb45e995da3..b2db5d6d568e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -850,6 +850,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 struct flush_busy_ctx_data {
        struct blk_mq_hw_ctx *hctx;
        struct list_head *list;
+       unsigned cnt;
 };
 
 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
@@ -857,11 +858,16 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned 
int bitnr, void *data)
        struct flush_busy_ctx_data *flush_data = data;
        struct blk_mq_hw_ctx *hctx = flush_data->hctx;
        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+       unsigned cnt = 0;
 
        sbitmap_clear_bit(sb, bitnr);
        spin_lock(&ctx->lock);
        list_splice_tail_init(&ctx->rq_list, flush_data->list);
+       cnt = ctx->cnt;
+       ctx->cnt = 0;;
        spin_unlock(&ctx->lock);
+       flush_data->cnt += cnt;
+
        return true;
 }
 
@@ -869,14 +875,17 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned 
int bitnr, void *data)
  * Process software queues that have been marked busy, splicing them
  * to the for-dispatch
  */
-void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+unsigned blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head 
*list)
 {
        struct flush_busy_ctx_data data = {
                .hctx = hctx,
                .list = list,
+               .cnt  = 0,
        };
 
        sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
+
+       return data.cnt;
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
@@ -897,6 +906,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, 
unsigned int bitnr, void *d
                list_del_init(&dispatch_data->rq->queuelist);
                if (list_empty(&ctx->rq_list))
                        sbitmap_clear_bit(sb, bitnr);
+               ctx->cnt--;
        }
        spin_unlock(&ctx->lock);
 
@@ -1052,7 +1062,9 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx 
*hctx)
        return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+/* @cnt represents the number of requests in @list */
+bool blk_mq_dispatch_rq_list(struct request_queue *q,
+                            struct list_head *list, unsigned cnt)
 {
        struct blk_mq_hw_ctx *hctx;
        struct request *rq;
@@ -1139,6 +1151,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, 
struct list_head *list)
                blk_mq_put_driver_tag(rq);
 
                spin_lock(&hctx->lock);
+               hctx->cnt += cnt - queued - errors;
                list_splice_init(list, &hctx->dispatch);
                /*
                 * DISPATCH_BUSY won't be cleared until all requests
@@ -1431,6 +1444,7 @@ static inline void __blk_mq_insert_req_list(struct 
blk_mq_hw_ctx *hctx,
                list_add(&rq->queuelist, &ctx->rq_list);
        else
                list_add_tail(&rq->queuelist, &ctx->rq_list);
+       ctx->cnt++;
 }
 
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1454,6 +1468,7 @@ void blk_mq_request_bypass_insert(struct request *rq)
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
 
        spin_lock(&hctx->lock);
+       hctx->cnt++;
        list_add_tail(&rq->queuelist, &hctx->dispatch);
        set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
        spin_unlock(&hctx->lock);
@@ -1941,6 +1956,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, 
struct hlist_node *node)
        if (!list_empty(&ctx->rq_list)) {
                list_splice_init(&ctx->rq_list, &tmp);
                blk_mq_hctx_clear_pending(hctx, ctx);
+               ctx->cnt = 0;
        }
        spin_unlock(&ctx->lock);
 
@@ -1950,6 +1966,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, 
struct hlist_node *node)
        spin_lock(&hctx->lock);
        list_splice_tail_init(&tmp, &hctx->dispatch);
        set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
+       hctx->cnt++;
        spin_unlock(&hctx->lock);
 
        blk_mq_run_hw_queue(hctx, true);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 231cfb0d973b..0da5a81cc41f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -9,6 +9,7 @@ struct blk_mq_ctx {
        struct {
                spinlock_t              lock;
                struct list_head        rq_list;
+               unsigned                cnt;
        }  ____cacheline_aligned_in_smp;
 
        unsigned int            cpu;
@@ -30,8 +31,9 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
-void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head 
*list);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *,
+                            unsigned);
+unsigned blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head 
*list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
                                bool wait);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 13f6c25fa461..778dbdd596f3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -11,6 +11,7 @@ struct blk_flush_queue;
 struct blk_mq_hw_ctx {
        struct {
                spinlock_t              lock;
+               unsigned                cnt;
                struct list_head        dispatch;
                unsigned long           state;          /* BLK_MQ_S_* flags */
        } ____cacheline_aligned_in_smp;
@@ -31,6 +32,7 @@ struct blk_mq_hw_ctx {
        struct sbitmap          ctx_map;
 
        struct blk_mq_ctx       *dispatch_from;
+       unsigned                dispatch_len;
 
        struct blk_mq_ctx       **ctxs;
        unsigned int            nr_ctx;


Thanks,
Ming

Reply via email to