On Mon, Oct 29, 2018 at 01:49:18PM -0600, Jens Axboe wrote:
> On 10/29/18 1:30 PM, Jens Axboe wrote:
> > On 10/29/18 1:27 PM, Bart Van Assche wrote:
> >> On Mon, 2018-10-29 at 10:37 -0600, Jens Axboe wrote:
> >>>  void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>>  {
> >>>   struct blk_mq_ctx *this_ctx;
> >>> @@ -1628,7 +1649,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>>   struct request *rq;
> >>>   LIST_HEAD(list);
> >>>   LIST_HEAD(ctx_list);
> >>> - unsigned int depth;
> >>> + unsigned int depth, this_flags;
> >>>  
> >>>   list_splice_init(&plug->mq_list, &list);
> >>>  
> >>> @@ -1636,13 +1657,14 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>>  
> >>>   this_q = NULL;
> >>>   this_ctx = NULL;
> >>> + this_flags = 0;
> >>>   depth = 0;
> >>>  
> >>>   while (!list_empty(&list)) {
> >>>           rq = list_entry_rq(list.next);
> >>>           list_del_init(&rq->queuelist);
> >>>           BUG_ON(!rq->q);
> >>> -         if (rq->mq_ctx != this_ctx) {
> >>> +         if (!ctx_match(rq, this_ctx, this_flags)) {
> >>>                   if (this_ctx) {
> >>>                   trace_block_unplug(this_q, depth, !from_schedule);
> >>>                           blk_mq_sched_insert_requests(this_q, this_ctx,
> >>> @@ -1650,6 +1672,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>>                                                           from_schedule);
> >>>                   }
> >>>  
> >>> +                 this_flags = rq->cmd_flags;
> >>>                   this_ctx = rq->mq_ctx;
> >>>                   this_q = rq->q;
> >>>                   depth = 0;
> >>
> >> This patch will cause the function stored in the flags_to_type pointer
> >> to be called 2 * (n - 1) times, where n is the number of elements in
> >> 'list' when blk_mq_sched_insert_requests() is called: ctx_match() runs
> >> for every request on the list, and each call that finds a matching ctx
> >> maps both the request's flags and the cached this_flags. Have you
> >> considered rearranging the code so that the number of calls is reduced
> >> to n?
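> >>
> >> A sketch of what I mean, purely hypothetical and assuming the hctx
> >> pointer could be cached in a per-request field up front (no such field
> >> exists today), e.g. right after the splice in blk_mq_flush_plug_list():
> >>
> >> 	/* one blk_mq_map_queue() call per request: n calls total */
> >> 	list_for_each_entry(rq, &list, queuelist)
> >> 		rq->mq_hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
> >> 					       rq->mq_ctx->cpu);
> >>
> >> ctx_match() could then compare the cached pointers directly instead of
> >> remapping on every comparison.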
> > 
> > One alternative is to improve the sorting, but then we'd need to call
> > the mapping function in there instead. My longer term plan is something
> > like the below, which will reduce the number of calls in general, not
> > just for this path. But that is a separate change and should not be
> > mixed with this one.
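> > 
> > As a sketch of that sorting alternative (hypothetical, and not what the
> > patch below does), the comparator itself could map each side:
> > 
> > 	/* pays two blk_mq_map_queue() calls per comparison */
> > 	static int plug_map_cmp(void *priv, struct list_head *a,
> > 				struct list_head *b)
> > 	{
> > 		struct request *rqa = container_of(a, struct request, queuelist);
> > 		struct request *rqb = container_of(b, struct request, queuelist);
> > 		struct blk_mq_hw_ctx *ha, *hb;
> > 
> > 		ha = blk_mq_map_queue(rqa->q, rqa->cmd_flags, rqa->mq_ctx->cpu);
> > 		hb = blk_mq_map_queue(rqb->q, rqb->cmd_flags, rqb->mq_ctx->cpu);
> > 
> > 		return ha > hb ||
> > 			(ha == hb && blk_rq_pos(rqa) > blk_rq_pos(rqb));
> > 	}
> > 
> > Caching the hctx in the request, as below, avoids paying the mapping
> > cost in either place.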
> 
> Here's an updated version that applies on top of the current tree, and
> also uses the cached hctx to sort efficiently. This eliminates the
> repeated blk_mq_map_queue() calls, and makes the whole thing clearer.
> 
> I'll split this into two patches, I just didn't want to include it in
> the series just yet.
> 
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 7922dba81497..397985808b75 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -219,7 +219,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
>  
>       /* release the tag's ownership to the req cloned from */
>       spin_lock_irqsave(&fq->mq_flush_lock, flags);
> -     hctx = blk_mq_map_queue(q, flush_rq->cmd_flags, flush_rq->mq_ctx->cpu);
> +     hctx = flush_rq->mq_hctx;
>       if (!q->elevator) {
>               blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
>               flush_rq->tag = -1;
> @@ -307,13 +307,13 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
>       if (!q->elevator) {
>               fq->orig_rq = first_rq;
>               flush_rq->tag = first_rq->tag;
> -             hctx = blk_mq_map_queue(q, first_rq->cmd_flags,
> -                                     first_rq->mq_ctx->cpu);
> +             hctx = first_rq->mq_hctx;
>               blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
>       } else {
>               flush_rq->internal_tag = first_rq->internal_tag;
>       }
>  
> +     flush_rq->mq_hctx = first_rq->mq_hctx;
>       flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
>       flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
>       flush_rq->rq_flags |= RQF_FLUSH_SEQ;
> @@ -326,13 +326,11 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
>  static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
>  {
>       struct request_queue *q = rq->q;
> -     struct blk_mq_hw_ctx *hctx;
> +     struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>       struct blk_mq_ctx *ctx = rq->mq_ctx;
>       unsigned long flags;
>       struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
>  
> -     hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> -
>       if (q->elevator) {
>               WARN_ON(rq->tag < 0);
>               blk_mq_put_driver_tag_hctx(hctx, rq);
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index fac70c81b7de..cde19be36135 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -427,10 +427,8 @@ struct show_busy_params {
>  static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
>  {
>       const struct show_busy_params *params = data;
> -     struct blk_mq_hw_ctx *hctx;
>  
> -     hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
> -     if (hctx == params->hctx)
> +     if (rq->mq_hctx == params->hctx)
>               __blk_mq_debugfs_rq_show(params->m,
>                                        list_entry_rq(&rq->queuelist));
>  }
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index d232ecf3290c..25c558358255 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -367,9 +367,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
>       struct request_queue *q = rq->q;
>       struct elevator_queue *e = q->elevator;
>       struct blk_mq_ctx *ctx = rq->mq_ctx;
> -     struct blk_mq_hw_ctx *hctx;
> -
> -     hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> +     struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>  
>       /* flush rq in flush machinery need to be dispatched directly */
>       if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
> @@ -399,16 +397,10 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
>  }
>  
>  void blk_mq_sched_insert_requests(struct request_queue *q,
> -                               struct blk_mq_ctx *ctx,
> +                               struct blk_mq_hw_ctx *hctx,
>                                 struct list_head *list, bool run_queue_async)
>  {
> -     struct blk_mq_hw_ctx *hctx;
>       struct elevator_queue *e;
> -     struct request *rq;
> -
> -     /* For list inserts, requests better be on the same hw queue */
> -     rq = list_first_entry(list, struct request, queuelist);
> -     hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
>  
>       e = hctx->queue->elevator;
>       if (e && e->type->ops.mq.insert_requests)
> @@ -424,7 +416,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
>                       if (list_empty(list))
>                               return;
>               }
> -             blk_mq_insert_requests(hctx, ctx, list);
> +             blk_mq_insert_requests(hctx, list);
>       }
>  
>       blk_mq_run_hw_queue(hctx, run_queue_async);
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 8a9544203173..a42547213f58 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -20,7 +20,7 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
>  void blk_mq_sched_insert_request(struct request *rq, bool at_head,
>                                bool run_queue, bool async);
>  void blk_mq_sched_insert_requests(struct request_queue *q,
> -                               struct blk_mq_ctx *ctx,
> +                               struct blk_mq_hw_ctx *hctx,
>                                 struct list_head *list, bool run_queue_async);
>  
>  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 478a959357f5..fb836d818b80 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -527,14 +527,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>   */
>  u32 blk_mq_unique_tag(struct request *rq)
>  {
> -     struct request_queue *q = rq->q;
> -     struct blk_mq_hw_ctx *hctx;
> -     int hwq = 0;
> -
> -     hctx = blk_mq_map_queue(q, rq->cmd_flags, rq->mq_ctx->cpu);
> -     hwq = hctx->queue_num;
> -
> -     return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
> +     return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
>               (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
>  }
>  EXPORT_SYMBOL(blk_mq_unique_tag);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 37310cc55733..17ea522bd7c1 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -300,6 +300,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>       /* csd/requeue_work/fifo_time is initialized before use */
>       rq->q = data->q;
>       rq->mq_ctx = data->ctx;
> +     rq->mq_hctx = data->hctx;
>       rq->rq_flags = rq_flags;
>       rq->cpu = -1;
>       rq->cmd_flags = op;
> @@ -473,10 +474,11 @@ static void __blk_mq_free_request(struct request *rq)
>  {
>       struct request_queue *q = rq->q;
>       struct blk_mq_ctx *ctx = rq->mq_ctx;
> -     struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> +     struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>       const int sched_tag = rq->internal_tag;
>  
>       blk_pm_mark_last_busy(rq);
> +     rq->mq_hctx = NULL;
>       if (rq->tag != -1)
>               blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
>       if (sched_tag != -1)
> @@ -490,7 +492,7 @@ void blk_mq_free_request(struct request *rq)
>       struct request_queue *q = rq->q;
>       struct elevator_queue *e = q->elevator;
>       struct blk_mq_ctx *ctx = rq->mq_ctx;
> -     struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> +     struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>  
>       if (rq->rq_flags & RQF_ELVPRIV) {
>               if (e && e->type->ops.mq.finish_request)
> @@ -982,7 +984,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
>  {
>       struct blk_mq_alloc_data data = {
>               .q = rq->q,
> -             .hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu),
> +             .hctx = rq->mq_hctx,
>               .flags = BLK_MQ_REQ_NOWAIT,
>               .cmd_flags = rq->cmd_flags,
>       };
> @@ -1148,7 +1150,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
>  
>               rq = list_first_entry(list, struct request, queuelist);
>  
> -             hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
> +             hctx = rq->mq_hctx;
>               if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
>                       break;
>  
> @@ -1578,9 +1580,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>   */
>  void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
>  {
> -     struct blk_mq_ctx *ctx = rq->mq_ctx;
> -     struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
> -                                                     ctx->cpu);
> +     struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>  
>       spin_lock(&hctx->lock);
>       list_add_tail(&rq->queuelist, &hctx->dispatch);
> @@ -1590,10 +1590,10 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
>               blk_mq_run_hw_queue(hctx, false);
>  }
>  
> -void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
> -                         struct list_head *list)
> +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list)
>  
>  {
> +     struct blk_mq_ctx *ctx = NULL;
>       struct request *rq;
>  
>       /*
> @@ -1601,7 +1601,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
>        * offline now
>        */
>       list_for_each_entry(rq, list, queuelist) {
> -             BUG_ON(rq->mq_ctx != ctx);
> +             BUG_ON(ctx && rq->mq_ctx != ctx);
> +             ctx = rq->mq_ctx;
>               trace_block_rq_insert(hctx->queue, rq);
>       }
>  
> @@ -1611,84 +1612,61 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
>       spin_unlock(&ctx->lock);
>  }
>  
> -static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
> +static int plug_hctx_cmp(void *priv, struct list_head *a, struct list_head *b)
>  {
>       struct request *rqa = container_of(a, struct request, queuelist);
>       struct request *rqb = container_of(b, struct request, queuelist);
>  
> -     return !(rqa->mq_ctx < rqb->mq_ctx ||
> -              (rqa->mq_ctx == rqb->mq_ctx &&
> +     return !(rqa->mq_hctx < rqb->mq_hctx ||
> +              (rqa->mq_hctx == rqb->mq_hctx &&
>                 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
>  }
>  
> -/*
> - * Need to ensure that the hardware queue matches, so we don't submit
> - * a list of requests that end up on different hardware queues.
> - */
> -static bool ctx_match(struct request *req, struct blk_mq_ctx *ctx,
> -                   unsigned int flags)
> -{
> -     if (req->mq_ctx != ctx)
> -             return false;
> -
> -     /*
> -      * If we just have one map, then we know the hctx will match
> -      * if the ctx matches
> -      */
> -     if (req->q->tag_set->nr_maps == 1)
> -             return true;
> -
> -     return blk_mq_map_queue(req->q, req->cmd_flags, ctx->cpu) ==
> -             blk_mq_map_queue(req->q, flags, ctx->cpu);
> -}
> -
>  void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
>  {
> -     struct blk_mq_ctx *this_ctx;
> +     struct blk_mq_hw_ctx *this_hctx;
>       struct request_queue *this_q;
>       struct request *rq;
>       LIST_HEAD(list);
> -     LIST_HEAD(ctx_list);
> -     unsigned int depth, this_flags;
> +     LIST_HEAD(hctx_list);
> +     unsigned int depth;
>  
>       list_splice_init(&plug->mq_list, &list);
>  
> -     list_sort(NULL, &list, plug_ctx_cmp);
> +     list_sort(NULL, &list, plug_hctx_cmp);
>  
>       this_q = NULL;
> -     this_ctx = NULL;
> -     this_flags = 0;
> +     this_hctx = NULL;
>       depth = 0;
>  
>       while (!list_empty(&list)) {
>               rq = list_entry_rq(list.next);
>               list_del_init(&rq->queuelist);
>               BUG_ON(!rq->q);
> -             if (!ctx_match(rq, this_ctx, this_flags)) {
> -                     if (this_ctx) {
> +             if (rq->mq_hctx != this_hctx) {
> +                     if (this_hctx) {
>                               trace_block_unplug(this_q, depth, !from_schedule);
> -                             blk_mq_sched_insert_requests(this_q, this_ctx,
> -                                                             &ctx_list,
> +                             blk_mq_sched_insert_requests(this_q, this_hctx,
> +                                                             &hctx_list,
>                                                               from_schedule);
>                       }

Requests can be added to the plug list from different ctxs because of
preemption. However, blk_mq_sched_insert_requests() requires that all
requests in 'hctx_list' belong to the same ctx.
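
For example, a task that is preempted and migrated between CPUs can plug
two requests with different rq->mq_ctx that still map to the same
rq->mq_hctx; after sorting by hctx alone they end up in one 'hctx_list'
and trip the BUG_ON() in blk_mq_insert_requests(). One way to keep the
single-ctx invariant, only a sketch and not a tested patch, is to sort by
ctx first and flush a batch whenever either the ctx or the hctx changes:

	/*
	 * Hypothetical comparator: order by software queue first, then
	 * hardware queue, then LBA, so that each flushed batch shares a
	 * single ctx.
	 */
	static int plug_rq_cmp(void *priv, struct list_head *a,
			       struct list_head *b)
	{
		struct request *rqa = container_of(a, struct request, queuelist);
		struct request *rqb = container_of(b, struct request, queuelist);

		if (rqa->mq_ctx != rqb->mq_ctx)
			return rqa->mq_ctx > rqb->mq_ctx;
		if (rqa->mq_hctx != rqb->mq_hctx)
			return rqa->mq_hctx > rqb->mq_hctx;

		return blk_rq_pos(rqa) > blk_rq_pos(rqb);
	}

with blk_mq_flush_plug_list() then tracking this_ctx alongside this_hctx
and draining the batch when either one differs.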

Thanks,
Ming
