On Fri, 2017-11-03 at 23:18 +0800, Ming Lei wrote:
> Subject: [PATCH] SCSI_MQ: fix IO hang in case of queue busy
> 
> We have to insert the rq back before checking .device_busy,
> otherwise When IO completes just after the check and before
> this req is added to hctx->dispatch, this queue may never get
> chance to be run, then this IO may hang forever.
> 
> This patch introduces BLK_STS_RESOURCE_OK for handling this
> issue.
> 
> Signed-off-by: Ming Lei <[email protected]>
> ---
>  block/blk-mq.c            | 17 +++++++++++++++++
>  drivers/scsi/scsi_lib.c   |  8 ++++++++
>  include/linux/blk-mq.h    |  1 +
>  include/linux/blk_types.h |  1 +
>  4 files changed, 27 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index e4d2490f4e7e..e1e03576edca 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -660,6 +660,16 @@ static void __blk_mq_requeue_request(struct request *rq)
>       }
>  }
>  
> +void blk_mq_reinsert_request_hctx(struct blk_mq_hw_ctx *hctx, struct request 
> *rq)
> +{
> +     __blk_mq_requeue_request(rq);
> +
> +     spin_lock(&hctx->lock);
> +     list_add_tail(&rq->queuelist, &hctx->dispatch);
> +     spin_unlock(&hctx->lock);
> +}
> +EXPORT_SYMBOL(blk_mq_reinsert_request_hctx);
> +
>  void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
>  {
>       __blk_mq_requeue_request(rq);
> @@ -1165,6 +1175,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, 
> struct list_head *list,
>                       list_add(&rq->queuelist, list);
>                       __blk_mq_requeue_request(rq);
>                       break;
> +             } else if (ret == BLK_STS_RESOURCE_OK) {
> +                     /*
> +                      * BLK_STS_RESOURCE_OK means driver handled this
> +                      * STS_RESOURCE already, we just need to stop dispatch.
> +                      */
> +                     break;
>               }
>  
>   fail_rq:
> @@ -1656,6 +1672,7 @@ static void __blk_mq_try_issue_directly(struct 
> blk_mq_hw_ctx *hctx,
>       ret = q->mq_ops->queue_rq(hctx, &bd);
>       switch (ret) {
>       case BLK_STS_OK:
> +     case BLK_STS_RESOURCE_OK:
>               *cookie = new_cookie;
>               return;
>       case BLK_STS_RESOURCE:
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 7f218ef61900..0165c1caed82 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -2030,9 +2030,17 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx 
> *hctx,
>       case BLK_STS_OK:
>               break;
>       case BLK_STS_RESOURCE:
> +             /*
> +              * We have to insert the rq back before checking .device_busy,
> +              * otherwise when IO completes just after the check and before
> +              * this req is added to hctx->dispatch, this queue may never get
> +              * chance to be run, then this IO may hang forever.
> +              */
> +             blk_mq_reinsert_request_hctx(hctx, req);
>               if (atomic_read(&sdev->device_busy) == 0 &&
>                   !scsi_device_blocked(sdev))
>                       blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
> +             ret = BLK_STS_RESOURCE_OK;
>               break;
>       default:
>               /*
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index e5e6becd57d3..4740f643d8c5 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -244,6 +244,7 @@ void blk_mq_start_request(struct request *rq);
>  void blk_mq_end_request(struct request *rq, blk_status_t error);
>  void __blk_mq_end_request(struct request *rq, blk_status_t error);
>  
> +void blk_mq_reinsert_request_hctx(struct blk_mq_hw_ctx *hctx, struct request 
> *rq);
>  void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
>  void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
>                               bool kick_requeue_list);
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 3385c89f402e..b630cc026a93 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -32,6 +32,7 @@ typedef u8 __bitwise blk_status_t;
>  #define BLK_STS_PROTECTION   ((__force blk_status_t)8)
>  #define BLK_STS_RESOURCE     ((__force blk_status_t)9)
>  #define BLK_STS_IOERR                ((__force blk_status_t)10)
> +#define BLK_STS_RESOURCE_OK  ((__force blk_status_t)11)
>  
>  /* hack for device mapper, don't use elsewhere: */
>  #define BLK_STS_DM_REQUEUE    ((__force blk_status_t)11)

Running test one with this patch applied on top of Jens' for-next branch yields
the same result as without this patch:
[ ... ]
Error: unloading kernel module ib_srp failed
Test /home/bart/software/infiniband/srp-test/tests/01 failed
# dmesg -c >/dev/null; echo w > /proc/sysrq-trigger; dmesg
[  325.167295] sysrq: SysRq : Show Blocked State
[  325.173612]   task                        PC stack   pid father
[  325.181903] kworker/10:1    D    0   165      2 0x80000000
[  325.189396] Workqueue: srp_remove srp_remove_work [ib_srp]
[  325.196222] Call Trace:
[  325.199622]  __schedule+0x2fa/0xbb0
[  325.204227]  schedule+0x36/0x90
[  325.208390]  async_synchronize_cookie_domain+0x88/0x130
[  325.214915]  ? finish_wait+0x90/0x90
[  325.219574]  async_synchronize_full_domain+0x18/0x20
[  325.225816]  sd_remove+0x4d/0xc0 [sd_mod]
[  325.230949]  device_release_driver_internal+0x160/0x210
[  325.237479]  device_release_driver+0x12/0x20
[  325.242904]  bus_remove_device+0x100/0x180
[  325.248163]  device_del+0x1d8/0x340
[  325.252735]  __scsi_remove_device+0xfc/0x130
[  325.258192]  scsi_forget_host+0x25/0x70
[  325.263173]  scsi_remove_host+0x79/0x120
[  325.268227]  srp_remove_work+0x90/0x1d0 [ib_srp]
[  325.274059]  process_one_work+0x20a/0x660
[  325.279243]  worker_thread+0x3d/0x3b0
[  325.284004]  kthread+0x13a/0x150
[  325.288232]  ? process_one_work+0x660/0x660
[  325.293575]  ? kthread_create_on_node+0x40/0x40
[  325.299334]  ret_from_fork+0x27/0x40
[  325.304002] kworker/5:1     D    0   180      2 0x80000000
[  325.310768] Workqueue: kaluad alua_rtpg_work [scsi_dh_alua]
[  325.317674] Call Trace:
[  325.321047]  __schedule+0x2fa/0xbb0
[  325.325615]  ? trace_hardirqs_on_caller+0xfb/0x190
[  325.331612]  schedule+0x36/0x90
[  325.335797]  schedule_timeout+0x22c/0x570
[  325.340917]  ? call_timer_fn+0x330/0x330
[  325.345972]  ? wait_for_completion_io_timeout+0xf7/0x180
[  325.352558]  io_schedule_timeout+0x1e/0x50
[  325.357808]  ? io_schedule_timeout+0x1e/0x50
[  325.363262]  wait_for_completion_io_timeout+0x11f/0x180
[  325.369793]  ? wake_up_q+0x80/0x80
[  325.374267]  blk_execute_rq+0x86/0xc0
[  325.379012]  scsi_execute+0xdb/0x1f0
[  325.383678]  alua_rtpg_work+0x2b3/0xe8a [scsi_dh_alua]
[  325.390130]  process_one_work+0x20a/0x660
[  325.395818]  worker_thread+0x3d/0x3b0
[  325.401248]  kthread+0x13a/0x150
[  325.406056]  ? process_one_work+0x660/0x660
[  325.412070]  ? kthread_create_on_node+0x40/0x40
[  325.418474]  ret_from_fork+0x27/0x40
[  325.423874] kworker/u66:1   D    0   331      2 0x80000000
[  325.431374] Workqueue: events_unbound async_run_entry_fn
[  325.438651] Call Trace:
[  325.442662]  __schedule+0x2fa/0xbb0
[  325.447835]  ? trace_hardirqs_on_caller+0xfb/0x190
[  325.454522]  schedule+0x36/0x90
[  325.459339]  schedule_timeout+0x22c/0x570
[  325.465097]  ? call_timer_fn+0x330/0x330
[  325.470800]  ? wait_for_completion_io_timeout+0xf7/0x180
[  325.478017]  io_schedule_timeout+0x1e/0x50
[  325.483935]  ? io_schedule_timeout+0x1e/0x50
[  325.490015]  wait_for_completion_io_timeout+0x11f/0x180
[  325.497134]  ? wake_up_q+0x80/0x80
[  325.502272]  blk_execute_rq+0x86/0xc0
[  325.507652]  scsi_execute+0xdb/0x1f0
[  325.512960]  sd_revalidate_disk+0xed/0x1c70 [sd_mod]
[  325.519846]  sd_probe_async+0xc3/0x1d0 [sd_mod]
[  325.526199]  async_run_entry_fn+0x38/0x160
[  325.532121]  process_one_work+0x20a/0x660
[  325.537897]  worker_thread+0x3d/0x3b0
[  325.543319]  kthread+0x13a/0x150
[  325.548200]  ? process_one_work+0x660/0x660
[  325.554204]  ? kthread_create_on_node+0x40/0x40
[  325.560605]  ret_from_fork+0x27/0x40

Reply via email to