On Wed, 2018-02-07 at 12:07 -0800, [email protected] wrote:
> Ah, you're right. u64_stat_sync doesn't imply barriers, so we want
> something like the following.
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index df93102..d6edf3b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -593,7 +593,7 @@ static void blk_mq_rq_update_aborted_gstate(struct
> request *rq, u64 gstate)
> */
> local_irq_save(flags);
> u64_stats_update_begin(&rq->aborted_gstate_sync);
> - rq->aborted_gstate = gstate;
> + smp_store_release(&rq->aborted_gstate, gstate);
> u64_stats_update_end(&rq->aborted_gstate_sync);
> local_irq_restore(flags);
> }
> @@ -605,7 +605,7 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
>
> do {
> start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
> - aborted_gstate = rq->aborted_gstate;
> + aborted_gstate = smp_load_acquire(&rq->aborted_gstate);
> } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
>
> return aborted_gstate;
> @@ -836,8 +836,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool
> reserved)
> * ->aborted_gstate is set, this may lead to ignored
> * completions and further spurious timeouts.
> */
> - blk_mq_rq_update_aborted_gstate(req, 0);
> blk_add_timer(req);
> + blk_mq_rq_update_aborted_gstate(req, 0);
> break;
> case BLK_EH_NOT_HANDLED:
> break;
Hello Tejun,
With this patch applied I see requests for which it seems like the timeout
handler
did not get invoked:
sdc/hctx0/busy:0000000095e04b7c {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT|MQ_TIMEOUT_EXPIRED, .state=i
n_flight, .gstate=0xed/0xed, .tag=26, .internal_tag=-1, .cmd=Write(10) 2a 00 00
00 60 ba 00 00 08 00, .retries=0, .result = 0x50000, .flags=TAGGED|INITIALIZED,
.timeout=1.000, allocated 1093.180 s ago}
sdc/hctx0/busy:0000000065a64e9b {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT|MQ_TIMEOUT_EXPIRED, .state=i
n_flight, .gstate=0x5/0x5, .tag=27, .internal_tag=-1, .cmd=Write(10) 2a 00 00 00
62 d2 00 00 08 00, .retries=0, .result = 0x50000, .flags=TAGGED|INITIALIZED, .t
imeout=1.000, allocated 1093.180 s ago}
[ ... ]
sdc/hctx3/busy:00000000479cc2a9 {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT, .state=in_flight, .gstate=0
x11/0x11, .tag=57, .internal_tag=-1, .cmd=Write(10) 2a 00 00 00 61 d2 00 00 08 0
0, .retries=0, .result = 0x0, .flags=TAGGED|INITIALIZED, .timeout=1.000, allocat
ed 1093.150 s ago}
sdc/hctx3/busy:000000008fd130d5 {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT, .state=in_flight, .gstate=0
xd/0xd, .tag=61, .internal_tag=-1, .cmd=Write(10) 2a 00 00 00 c3 94 00 00 08 00,
.retries=0, .result = 0x0, .flags=TAGGED|INITIALIZED, .timeout=1.000, allocated
1093.140 s ago}
As one can see for some requests MQ_TIMEOUT_EXPIRED is set and .result =
0x50000.
The value of .result means that the SCSI error handler has submitted an abort
(see
also scmnd->result = DID_ABORT << 16 in drivers/infiniband/ulp/srp/ib_srp.c).
For
the last two requests shown above however MQ_TIMEOUT_EXPIRED is not set and the
SCSI result has value 0. I think that means that it can happen that a request
times out but that the timeout handler does not get invoked ...
Thanks,
Bart.