Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-21 Thread Bart Van Assche
On Wed, 2018-02-21 at 11:21 -0800, t...@kernel.org wrote:
> Hello, Bart.
> 
> On Wed, Feb 21, 2018 at 06:53:05PM +, Bart Van Assche wrote:
> > On Sun, 2018-02-18 at 05:11 -0800, t...@kernel.org wrote:
> > > On Wed, Feb 14, 2018 at 04:58:56PM +, Bart Van Assche wrote:
> > > > With this patch applied the tests I ran so far pass.
> > > 
> > > Ah, great to hear.  Thanks a lot for testing.  Can you please verify
> > > the following?  It's the same approach but with RCU sync batching.
> > 
> > Hello Tejun,
> > 
> > After having merged kernel v4.16-rc2 into my kernel tree and after having
> > applied patch "Avoid that ATA error handling hangs"
> > (https://www.mail-archive.com/linux-scsi@vger.kernel.org/msg71145.html) I
> > have not been able to reproduce the reported crash - neither with the patch
> > applied that was posted on February 13th nor without that patch. This makes
> > me wonder whether we should drop the discussed patches unless someone comes
> > up with a reproducible test case?
> 
> It is an actual bug in that we actually can override the timer setting
> of the next request instance.  Given that the race window isn't that
> large, it makes sense that the reproducibility is affected by
> butterflies.  I think it makes sense to fix the bug.  Any chance you
> can test the new patch on top of the reproducible setup?

Hello Tejun,

Since I had not saved any of the trees that I had used during my tests I picked
several trees from the "git reflog" output and tried to reproduce the crash that
I had reported with these trees. Unfortunately so far without success.

Bart.




Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-21 Thread Bart Van Assche
On Sun, 2018-02-18 at 05:11 -0800, t...@kernel.org wrote:
> On Wed, Feb 14, 2018 at 04:58:56PM +, Bart Van Assche wrote:
> > With this patch applied the tests I ran so far pass.
> 
> Ah, great to hear.  Thanks a lot for testing.  Can you please verify
> the following?  It's the same approach but with RCU sync batching.

Hello Tejun,

After having merged kernel v4.16-rc2 into my kernel tree and after having
applied patch "Avoid that ATA error handling hangs"
(https://www.mail-archive.com/linux-scsi@vger.kernel.org/msg71145.html) I
have not been able to reproduce the reported crash - neither with the patch
applied that was posted on February 13th nor without that patch. This makes
me wonder whether we should drop the discussed patches unless someone comes
up with a reproducible test case?

Thanks,

Bart.




Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-21 Thread t...@kernel.org
Hello, Bart.

On Wed, Feb 21, 2018 at 06:53:05PM +, Bart Van Assche wrote:
> On Sun, 2018-02-18 at 05:11 -0800, t...@kernel.org wrote:
> > On Wed, Feb 14, 2018 at 04:58:56PM +, Bart Van Assche wrote:
> > > With this patch applied the tests I ran so far pass.
> > 
> > Ah, great to hear.  Thanks a lot for testing.  Can you please verify
> > the following?  It's the same approach but with RCU sync batching.
> 
> Hello Tejun,
> 
> After having merged kernel v4.16-rc2 into my kernel tree and after having
> applied patch "Avoid that ATA error handling hangs"
> (https://www.mail-archive.com/linux-scsi@vger.kernel.org/msg71145.html) I
> have not been able to reproduce the reported crash - neither with the patch
> applied that was posted on February 13th nor without that patch. This makes
> me wonder whether we should drop the discussed patches unless someone comes
> up with a reproducible test case?

It is an actual bug in that we actually can override the timer setting
of the next request instance.  Given that the race window isn't that
large, it makes sense that the reproducibility is affected by
butterflies.  I think it makes sense to fix the bug.  Any chance you
can test the new patch on top of the reproducible setup?

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-18 Thread t...@kernel.org
Hello, Bart.

On Wed, Feb 14, 2018 at 04:58:56PM +, Bart Van Assche wrote:
> With this patch applied the tests I ran so far pass.

Ah, great to hear.  Thanks a lot for testing.  Can you please verify
the following?  It's the same approach but with RCU sync batching.

Thanks.

Index: work/block/blk-mq.c
===
--- work.orig/block/blk-mq.c
+++ work/block/blk-mq.c
@@ -816,7 +816,8 @@ struct blk_mq_timeout_data {
unsigned int nr_expired;
 };
 
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct blk_mq_hw_ctx *hctx, struct request 
*req,
+   int *nr_resets, bool reserved)
 {
const struct blk_mq_ops *ops = req->q->mq_ops;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
@@ -831,13 +832,10 @@ static void blk_mq_rq_timed_out(struct r
__blk_mq_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
-   /*
-* As nothing prevents from completion happening while
-* ->aborted_gstate is set, this may lead to ignored
-* completions and further spurious timeouts.
-*/
-   blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
+   req->rq_flags |= RQF_MQ_TIMEOUT_RESET;
+   (*nr_resets)++;
+   hctx->need_sync_rcu = true;
break;
case BLK_EH_NOT_HANDLED:
break;
@@ -874,13 +872,34 @@ static void blk_mq_check_expired(struct
time_after_eq(jiffies, deadline)) {
blk_mq_rq_update_aborted_gstate(rq, gstate);
data->nr_expired++;
-   hctx->nr_expired++;
+   hctx->need_sync_rcu = true;
} else if (!data->next_set || time_after(data->next, deadline)) {
data->next = deadline;
data->next_set = 1;
}
 }
 
+static void blk_mq_timeout_sync_rcu(struct request_queue *q)
+{
+   struct blk_mq_hw_ctx *hctx;
+   bool has_rcu = false;
+   int i;
+
+   queue_for_each_hw_ctx(q, hctx, i) {
+   if (!hctx->need_sync_rcu)
+   continue;
+
+   if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+   has_rcu = true;
+   else
+   synchronize_srcu(hctx->srcu);
+
+   hctx->need_sync_rcu = false;
+   }
+   if (has_rcu)
+   synchronize_rcu();
+}
+
 static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
 {
@@ -893,7 +912,25 @@ static void blk_mq_terminate_expired(str
 */
if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
READ_ONCE(rq->gstate) == rq->aborted_gstate)
-   blk_mq_rq_timed_out(rq, reserved);
+   blk_mq_rq_timed_out(hctx, rq, priv, reserved);
+}
+
+static void blk_mq_finish_timeout_reset(struct blk_mq_hw_ctx *hctx,
+   struct request *rq, void *priv, bool reserved)
+{
+   /*
+* @rq's timer reset has gone through rcu synchronization and is
+* visible now.  Allow normal completions again by resetting
+* ->aborted_gstate.  Don't clear RQF_MQ_TIMEOUT_RESET here as
+* there's no memory barrier around ->aborted_gstate.  Let
+* blk_add_timer() clear it later.
+*
+* As nothing prevents from completion happening while
+* ->aborted_gstate is set, this may lead to ignored completions
+* and further spurious timeouts.
+*/
+   if (rq->rq_flags & RQF_MQ_TIMEOUT_RESET)
+   blk_mq_rq_update_aborted_gstate(rq, 0);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
@@ -928,7 +965,7 @@ static void blk_mq_timeout_work(struct w
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, );
 
if (data.nr_expired) {
-   bool has_rcu = false;
+   int nr_resets = 0;
 
/*
 * Wait till everyone sees ->aborted_gstate.  The
@@ -936,22 +973,22 @@ static void blk_mq_timeout_work(struct w
 * becomes a problem, we can add per-hw_ctx rcu_head and
 * wait in parallel.
 */
-   queue_for_each_hw_ctx(q, hctx, i) {
-   if (!hctx->nr_expired)
-   continue;
-
-   if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-   has_rcu = true;
-   else
-   synchronize_srcu(hctx->srcu);
-
-   hctx->nr_expired = 0;
-   }
-   if (has_rcu)
-   synchronize_rcu();
+   blk_mq_timeout_sync_rcu(q);
 
/* terminate the ones we won */
-   blk_mq_queue_tag_busy_iter(q, 

Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-14 Thread Bart Van Assche
On Tue, 2018-02-13 at 13:20 -0800, t...@kernel.org wrote:
> On Thu, Feb 08, 2018 at 04:31:43PM +, Bart Van Assche wrote:
> > The crash is reported at address scsi_times_out+0x17 == scsi_times_out+23. 
> > The
> > instruction at that address tries to dereference scsi_cmnd.device (%rax). 
> > The
> > register dump shows that that pointer has the value NULL. The only function 
> > I
> > know of that clears the scsi_cmnd.device pointer is scsi_req_init(). The 
> > only
> > caller of that function in the SCSI core is scsi_initialize_rq(). That 
> > function
> > has two callers, namely scsi_init_command() and blk_get_request(). However,
> > the scsi_cmnd.device pointer is not cleared when a request finishes. This is
> > why I think that the above crash report indicates that scsi_times_out() was
> > called for a request that was being reinitialized and not by device 
> > hotplugging.
> 
> Can you please give the following patch a shot?  While timeout path is
> synchornizing against the completion path (and the following re-init)
> while taking back control of a timed-out request, it wasn't doing that
> while giving it back, so the timer registration could race against
> completion and re-issue.  I'm still not quite sure how that can lead
> to the oops tho.  Anyways, we need something like this one way or the
> other.
> 
> This isn't the final patch.  We should add batching-up of rcu
> synchronize calls similar to the abort path.
> 
> Thanks.
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index df93102..b66aec3 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -816,7 +816,8 @@ struct blk_mq_timeout_data {
>   unsigned int nr_expired;
>  };
>  
> -static void blk_mq_rq_timed_out(struct request *req, bool reserved)
> +static void blk_mq_rq_timed_out(struct blk_mq_hw_ctx *hctx, struct request 
> *req,
> + bool reserved)
>  {
>   const struct blk_mq_ops *ops = req->q->mq_ops;
>   enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
> @@ -836,8 +837,12 @@ static void blk_mq_rq_timed_out(struct request *req, 
> bool reserved)
>* ->aborted_gstate is set, this may lead to ignored
>* completions and further spurious timeouts.
>*/
> - blk_mq_rq_update_aborted_gstate(req, 0);
>   blk_add_timer(req);
> + if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> + synchronize_rcu();
> + else
> + synchronize_srcu(hctx->srcu);
> + blk_mq_rq_update_aborted_gstate(req, 0);
>   break;
>   case BLK_EH_NOT_HANDLED:
>   break;
> @@ -893,7 +898,7 @@ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx 
> *hctx,
>*/
>   if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
>   READ_ONCE(rq->gstate) == rq->aborted_gstate)
> - blk_mq_rq_timed_out(rq, reserved);
> + blk_mq_rq_timed_out(hctx, rq, reserved);
>  }
>  
>  static void blk_mq_timeout_work(struct work_struct *work)

Hello Tejun,

With this patch applied the tests I ran so far pass.

Thanks,

Bart.





Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-13 Thread t...@kernel.org
Hello, Bart.

Sorry about the delay.

On Thu, Feb 08, 2018 at 04:31:43PM +, Bart Van Assche wrote:
> The crash is reported at address scsi_times_out+0x17 == scsi_times_out+23. The
> instruction at that address tries to dereference scsi_cmnd.device (%rax). The
> register dump shows that that pointer has the value NULL. The only function I
> know of that clears the scsi_cmnd.device pointer is scsi_req_init(). The only
> caller of that function in the SCSI core is scsi_initialize_rq(). That 
> function
> has two callers, namely scsi_init_command() and blk_get_request(). However,
> the scsi_cmnd.device pointer is not cleared when a request finishes. This is
> why I think that the above crash report indicates that scsi_times_out() was
> called for a request that was being reinitialized and not by device 
> hotplugging.

Can you please give the following patch a shot?  While timeout path is
synchornizing against the completion path (and the following re-init)
while taking back control of a timed-out request, it wasn't doing that
while giving it back, so the timer registration could race against
completion and re-issue.  I'm still not quite sure how that can lead
to the oops tho.  Anyways, we need something like this one way or the
other.

This isn't the final patch.  We should add batching-up of rcu
synchronize calls similar to the abort path.

Thanks.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index df93102..b66aec3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -816,7 +816,8 @@ struct blk_mq_timeout_data {
unsigned int nr_expired;
 };
 
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct blk_mq_hw_ctx *hctx, struct request 
*req,
+   bool reserved)
 {
const struct blk_mq_ops *ops = req->q->mq_ops;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
@@ -836,8 +837,12 @@ static void blk_mq_rq_timed_out(struct request *req, bool 
reserved)
 * ->aborted_gstate is set, this may lead to ignored
 * completions and further spurious timeouts.
 */
-   blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
+   if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+   synchronize_rcu();
+   else
+   synchronize_srcu(hctx->srcu);
+   blk_mq_rq_update_aborted_gstate(req, 0);
break;
case BLK_EH_NOT_HANDLED:
break;
@@ -893,7 +898,7 @@ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx 
*hctx,
 */
if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
READ_ONCE(rq->gstate) == rq->aborted_gstate)
-   blk_mq_rq_timed_out(rq, reserved);
+   blk_mq_rq_timed_out(hctx, rq, reserved);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)



Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread t...@kernel.org
On Thu, Feb 08, 2018 at 05:48:32PM +, Bart Van Assche wrote:
> On Thu, 2018-02-08 at 09:40 -0800, t...@kernel.org wrote:
> > Heh, sorry about not being clear.  What I'm trying to say is that
> > scmd->device != NULL && device->host == NULL.  Or was this what you
> > were saying all along?
> 
> What I agree with is that the request pointer (req argument) is stored in %rdi
> and that mov 0x1b0(%rdi),%rax loads scmd->device into %rax. Since RIP ==
> scsi_times_out+0x17, since the instruction at that address tries to 
> dereference
> %rax and since the register dump shows that %rax == NULL I think that means 
> that
> scmd->device == NULL.

Ah, I was completely confused about which one had to be NULL.  You're
absolutely right.  Let's continue earlier in the thread.

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread Bart Van Assche
On Thu, 2018-02-08 at 09:40 -0800, t...@kernel.org wrote:
> Heh, sorry about not being clear.  What I'm trying to say is that
> scmd->device != NULL && device->host == NULL.  Or was this what you
> were saying all along?

What I agree with is that the request pointer (req argument) is stored in %rdi
and that mov 0x1b0(%rdi),%rax loads scmd->device into %rax. Since RIP ==
scsi_times_out+0x17, since the instruction at that address tries to dereference
%rax and since the register dump shows that %rax == NULL I think that means that
scmd->device == NULL.

Thanks,

Bart.






Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread t...@kernel.org
On Thu, Feb 08, 2018 at 05:37:46PM +, Bart Van Assche wrote:
> On Thu, 2018-02-08 at 09:19 -0800, t...@kernel.org wrote:
> > Hello, Bart.
> > 
> > On Thu, Feb 08, 2018 at 05:10:45PM +, Bart Van Assche wrote:
> > > I think "dereferencing a pointer" means reading the memory location that 
> > > pointer points
> > > at? Anyway, I think we both interpret the crash report in the same way, 
> > > namely that it
> > > means that scmd->device == NULL.
> > 
> > No, what I'm trying to say is that it's is the device->host reference
> > not the scmd->device reference.
> 
> Again, I think that means that we agree about the interpretation of the crash
> report. The big question here is what the next step should be to analyze this
> further and/or to avoid that this crash can occur?

Heh, sorry about not being clear.  What I'm trying to say is that
scmd->device != NULL && device->host == NULL.  Or was this what you
were saying all along?

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread Bart Van Assche
On Thu, 2018-02-08 at 09:19 -0800, t...@kernel.org wrote:
> Hello, Bart.
> 
> On Thu, Feb 08, 2018 at 05:10:45PM +, Bart Van Assche wrote:
> > I think "dereferencing a pointer" means reading the memory location that 
> > pointer points
> > at? Anyway, I think we both interpret the crash report in the same way, 
> > namely that it
> > means that scmd->device == NULL.
> 
> No, what I'm trying to say is that it's is the device->host reference
> not the scmd->device reference.

Again, I think that means that we agree about the interpretation of the crash
report. The big question here is what the next step should be to analyze this
further and/or to avoid that this crash can occur?

Thanks,

Bart.







Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread t...@kernel.org
Hello, Bart.

On Thu, Feb 08, 2018 at 05:10:45PM +, Bart Van Assche wrote:
> I think "dereferencing a pointer" means reading the memory location that 
> pointer points
> at? Anyway, I think we both interpret the crash report in the same way, 
> namely that it
> means that scmd->device == NULL.

No, what I'm trying to say is that it's is the device->host reference
not the scmd->device reference.

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread Bart Van Assche
On Thu, 2018-02-08 at 09:00 -0800, t...@kernel.org wrote:
> On Thu, Feb 08, 2018 at 04:31:43PM +, Bart Van Assche wrote:
> > The crash is reported at address scsi_times_out+0x17 == scsi_times_out+23. 
> > The
> > instruction at that address tries to dereference scsi_cmnd.device (%rax). 
> > The
> > register dump shows that that pointer has the value NULL. The only function 
> > I
> > know of that clears the scsi_cmnd.device pointer is scsi_req_init(). The 
> > only
> > caller of that function in the SCSI core is scsi_initialize_rq(). That 
> > function
> > has two callers, namely scsi_init_command() and blk_get_request(). However,
> > the scsi_cmnd.device pointer is not cleared when a request finishes. This is
> > why I think that the above crash report indicates that scsi_times_out() was
> > called for a request that was being reinitialized and not by device 
> > hotplugging.
> 
> I could be misreading it but scsi_cmnd->device dereference should be
> the following.
> 
> 0x5bdd <+13>:mov0x1b0(%rdi),%rax
> 
> %rdi is @req, 0x1b0(%rdi) seems to be the combined arithmetic of
> blk_mq_rq_to_pdu() and ->device dereference - 0x178 + 0x38.  The
> faulting access is (%rax), which is deref'ing host from device.

Hello Tejun,

I think "dereferencing a pointer" means reading the memory location that 
pointer points
at? Anyway, I think we both interpret the crash report in the same way, namely 
that it
means that scmd->device == NULL.

Thanks,

Bart.





Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread t...@kernel.org
Hello, Bart.

On Thu, Feb 08, 2018 at 04:31:43PM +, Bart Van Assche wrote:
> > That sounds more like a scsi hotplug bug than an issue in the timeout
> > code unless we messed up @req pointer to begin with.
> 
> I don't think that this is related to SCSI hotplugging: this crash does not
> occur with the v4.15 block layer core and it does not occur with my timeout
> handler rework patch applied either. I think that means that we cannot
> exclude the block layer core timeout handler rework as a possible cause.
> 
> The disassembler output is as follows:
> 
> (gdb) disas /s scsi_times_out
> Dump of assembler code for function scsi_times_out:
> drivers/scsi/scsi_error.c:
> 282 {
>0x5bd0 <+0>: push   %r13
>0x5bd2 <+2>: push   %r12
>0x5bd4 <+4>: push   %rbp
> ./include/linux/blk-mq.h:
> 300 return rq + 1;
>0x5bd5 <+5>: lea0x178(%rdi),%rbp
> drivers/scsi/scsi_error.c:
> 282 {
>0x5bdc <+12>:push   %rbx
> 283 struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
> 284 enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
> 285 struct Scsi_Host *host = scmd->device->host;
>0x5bdd <+13>:mov0x1b0(%rdi),%rax
> 282 {
>0x5be4 <+20>:mov%rdi,%rbx
> 283 struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
> 284 enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
> 285 struct Scsi_Host *host = scmd->device->host;
>0x5be7 <+23>:mov(%rax),%r13
>0x5bea <+26>:nopl   0x0(%rax,%rax,1)
> [ ... ]
> (gdb) print /x sizeof(struct request)
> $2 = 0x178
> (gdb) print &(((struct scsi_cmnd*)0)->device)
> $4 = (struct scsi_device **) 0x38 
> (gdb) print &(((struct scsi_device*)0)->host)   
> $5 = (struct Scsi_Host **) 0x0
> 
> The crash is reported at address scsi_times_out+0x17 == scsi_times_out+23. The
> instruction at that address tries to dereference scsi_cmnd.device (%rax). The
> register dump shows that that pointer has the value NULL. The only function I
> know of that clears the scsi_cmnd.device pointer is scsi_req_init(). The only
> caller of that function in the SCSI core is scsi_initialize_rq(). That 
> function
> has two callers, namely scsi_init_command() and blk_get_request(). However,
> the scsi_cmnd.device pointer is not cleared when a request finishes. This is
> why I think that the above crash report indicates that scsi_times_out() was
> called for a request that was being reinitialized and not by device 
> hotplugging.

I could be misreading it but scsi_cmnd->device dereference should be
the following.

0x5bdd <+13>:mov0x1b0(%rdi),%rax

%rdi is @req, 0x1b0(%rdi) seems to be the combined arithmetic of
blk_mq_rq_to_pdu() and ->device dereference - 0x178 + 0x38.  The
faulting access is (%rax), which is deref'ing host from device.

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread Bart Van Assche
On Thu, 2018-02-08 at 07:39 -0800, t...@kernel.org wrote:
> On Thu, Feb 08, 2018 at 01:09:57AM +, Bart Van Assche wrote:
> > On Wed, 2018-02-07 at 23:48 +, Bart Van Assche wrote:
> > > With this patch applied I see requests for which it seems like the 
> > > timeout handler
> > > did not get invoked: [ ... ]
> > 
> > I just noticed the following in the system log, which is probably the 
> > reason why some
> > requests got stuck:
> > 
> > Feb  7 15:16:26 ubuntu-vm kernel: BUG: unable to handle kernel NULL pointer 
> > dereference at   (null)
> > Feb  7 15:16:26 ubuntu-vm kernel: IP: scsi_times_out+0x17/0x2c0 [scsi_mod]
> > Feb  7 15:16:26 ubuntu-vm kernel: PGD 0 P4D 0  
> > Feb  7 15:16:26 ubuntu-vm kernel: Oops:  [#1] PREEMPT SMP
> > Feb  7 15:16:26 ubuntu-vm kernel: Modules linked in: dm_service_time ib_srp 
> > libcrc32c crc32c_generic scsi_transport_srp target_core_pscsi 
> > target_core_file ib_srpt target_core_iblock
> > target_core_mod
> > rdma_rxe ip6_udp_tunnel udp_tunnel ib_umad ib_uverbs scsi_debug brd 
> > mq_deadline kyber_iosched deadline_iosched cfq_iosched bfq crct10dif_pclmul 
> > crc32_pclmul af_packet ghash_clmulni_intel pcbc
> > aesni_intel aes_x86_64 crypto_simd cryptd glue_helper serio_raw 
> > virtio_console virtio_balloon sg button i2c_piix4 dm_multipath dm_mod dax 
> > scsi_dh_rdac scsi_dh_emc scsi_dh_alua ib_iser rdma_cm
> > iw_cm
> > ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi 
> > ip_tables x_tables ipv6 autofs4 ext4 crc16 mbcache jbd2 sd_mod virtio_net 
> > virtio_blk virtio_scsi sr_mod cdrom
> > ata_generic
> > pata_acpi psmouse crc32c_intel i2c_core atkbd
> > Feb  7 15:16:26 ubuntu-vm kernel: uhci_hcd ehci_hcd intel_agp ata_piix 
> > intel_gtt agpgart libata virtio_pci usbcore virtio_ring virtio scsi_mod 
> > usb_common unix
> > Feb  7 15:16:26 ubuntu-vm kernel: CPU: 0 PID: 146 Comm: kworker/0:1H Not 
> > tainted 4.15.0-dbg+ #1
> > Feb  7 15:16:26 ubuntu-vm kernel: Hardware name: QEMU Standard PC (i440FX + 
> > PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
> > Feb  7 15:16:26 ubuntu-vm kernel: Workqueue: kblockd blk_mq_timeout_work
> > Feb  7 15:16:26 ubuntu-vm kernel: RIP: 0010:scsi_times_out+0x17/0x2c0 
> > [scsi_mod]
> > Feb  7 15:16:26 ubuntu-vm kernel: RSP: 0018:98f0c02abd58 EFLAGS: 
> > 00010246
> > Feb  7 15:16:26 ubuntu-vm kernel: RAX:  RBX: 
> > 965de2b3a2c0 RCX: 
> > Feb  7 15:16:26 ubuntu-vm kernel: RDX: c0094740 RSI: 
> >  RDI: 965de2b3a2c0
> > Feb  7 15:16:26 ubuntu-vm kernel: RBP: 965de2b3a438 R08: 
> > fffc R09: 0007
> > Feb  7 15:16:26 ubuntu-vm kernel: R10:  R11: 
> >  R12: 0002
> > Feb  7 15:16:26 ubuntu-vm kernel: R13:  R14: 
> > 965de2a44218 R15: 965de2a48728
> > Feb  7 15:16:26 ubuntu-vm kernel: FS:  () 
> > GS:965dffc0() knlGS:
> > Feb  7 15:16:26 ubuntu-vm kernel: CS:  0010 DS:  ES:  CR0: 
> > 80050033
> > Feb  7 15:16:26 ubuntu-vm kernel: CR2:  CR3: 
> > 3ce0f003 CR4: 003606f0
> > Feb  7 15:16:26 ubuntu-vm kernel: DR0:  DR1: 
> >  DR2: 
> > Feb  7 15:16:26 ubuntu-vm kernel: DR3:  DR6: 
> > fffe0ff0 DR7: 0400
> > Feb  7 15:16:26 ubuntu-vm kernel: Call Trace:
> > Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_terminate_expired+0x42/0x80
> > Feb  7 15:16:26 ubuntu-vm kernel: bt_iter+0x3d/0x50
> > Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_queue_tag_busy_iter+0xe9/0x200
> > Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_timeout_work+0x181/0x2e0
> > Feb  7 15:16:26 ubuntu-vm kernel: process_one_work+0x21c/0x6d0
> > Feb  7 15:16:26 ubuntu-vm kernel: worker_thread+0x35/0x380
> > Feb  7 15:16:26 ubuntu-vm kernel: kthread+0x117/0x130
> > Feb  7 15:16:26 ubuntu-vm kernel: ret_from_fork+0x24/0x30
> > Feb  7 15:16:26 ubuntu-vm kernel: Code: ff ff 0f ff e9 fd fe ff ff 90 66 2e 
> > 0f 1f 84 00 00 00 00 00 41 55 41 54 55 48 8d af 78 01 00 00 53 48 8b 87 b0 
> > 01 00 00 48 89 fb <4c> 8b 28 0f 1f 44 00 00
> > 65
> > 8b 05 6a b5 f8 3f 83 f8 0f 0f 87 ed  
> > Feb  7 15:16:26 ubuntu-vm kernel: RIP: scsi_times_out+0x17/0x2c0 [scsi_mod] 
> > RSP: 98f0c02abd58
> > Feb  7 15:16:26 ubuntu-vm kernel: CR2: 
> > Feb  7 15:16:26 ubuntu-vm kernel: ---[ end trace ce6c20d95bab450e ]---
> 
> So, that's dereferencing %rax which is NULL.  That gotta be the ->host
> deref in the following.
> 
>   enum blk_eh_timer_return scsi_times_out(struct request *req)
>   {
> struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
> enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
> struct Scsi_Host *host = scmd->device->host;
>   ...
> 
> That sounds more like a scsi hotplug bug than an issue in the timeout
> code unless we messed up @req pointer to begin with.

I 

Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread t...@kernel.org
On Thu, Feb 08, 2018 at 07:39:40AM -0800, t...@kernel.org wrote:
> That sounds more like a scsi hotplug but than an issue in the timeout
   ^bug
> code unless we messed up @req pointer to begin with.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-08 Thread t...@kernel.org
On Thu, Feb 08, 2018 at 01:09:57AM +, Bart Van Assche wrote:
> On Wed, 2018-02-07 at 23:48 +, Bart Van Assche wrote:
> > With this patch applied I see requests for which it seems like the timeout 
> > handler
> > did not get invoked: [ ... ]
> 
> I just noticed the following in the system log, which is probably the reason 
> why some
> requests got stuck:
> 
> Feb  7 15:16:26 ubuntu-vm kernel: BUG: unable to handle kernel NULL pointer 
> dereference at   (null)
> Feb  7 15:16:26 ubuntu-vm kernel: IP: scsi_times_out+0x17/0x2c0 [scsi_mod]
> Feb  7 15:16:26 ubuntu-vm kernel: PGD 0 P4D 0  
> Feb  7 15:16:26 ubuntu-vm kernel: Oops:  [#1] PREEMPT SMP
> Feb  7 15:16:26 ubuntu-vm kernel: Modules linked in: dm_service_time ib_srp 
> libcrc32c crc32c_generic scsi_transport_srp target_core_pscsi 
> target_core_file ib_srpt target_core_iblock target_core_mod
> rdma_rxe ip6_udp_tunnel udp_tunnel ib_umad ib_uverbs scsi_debug brd 
> mq_deadline kyber_iosched deadline_iosched cfq_iosched bfq crct10dif_pclmul 
> crc32_pclmul af_packet ghash_clmulni_intel pcbc
> aesni_intel aes_x86_64 crypto_simd cryptd glue_helper serio_raw 
> virtio_console virtio_balloon sg button i2c_piix4 dm_multipath dm_mod dax 
> scsi_dh_rdac scsi_dh_emc scsi_dh_alua ib_iser rdma_cm iw_cm
> ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi 
> ip_tables x_tables ipv6 autofs4 ext4 crc16 mbcache jbd2 sd_mod virtio_net 
> virtio_blk virtio_scsi sr_mod cdrom ata_generic
> pata_acpi psmouse crc32c_intel i2c_core atkbd
> Feb  7 15:16:26 ubuntu-vm kernel: uhci_hcd ehci_hcd intel_agp ata_piix 
> intel_gtt agpgart libata virtio_pci usbcore virtio_ring virtio scsi_mod 
> usb_common unix
> Feb  7 15:16:26 ubuntu-vm kernel: CPU: 0 PID: 146 Comm: kworker/0:1H Not 
> tainted 4.15.0-dbg+ #1
> Feb  7 15:16:26 ubuntu-vm kernel: Hardware name: QEMU Standard PC (i440FX + 
> PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
> Feb  7 15:16:26 ubuntu-vm kernel: Workqueue: kblockd blk_mq_timeout_work
> Feb  7 15:16:26 ubuntu-vm kernel: RIP: 0010:scsi_times_out+0x17/0x2c0 
> [scsi_mod]
> Feb  7 15:16:26 ubuntu-vm kernel: RSP: 0018:98f0c02abd58 EFLAGS: 00010246
> Feb  7 15:16:26 ubuntu-vm kernel: RAX:  RBX: 965de2b3a2c0 
> RCX: 
> Feb  7 15:16:26 ubuntu-vm kernel: RDX: c0094740 RSI:  
> RDI: 965de2b3a2c0
> Feb  7 15:16:26 ubuntu-vm kernel: RBP: 965de2b3a438 R08: fffc 
> R09: 0007
> Feb  7 15:16:26 ubuntu-vm kernel: R10:  R11:  
> R12: 0002
> Feb  7 15:16:26 ubuntu-vm kernel: R13:  R14: 965de2a44218 
> R15: 965de2a48728
> Feb  7 15:16:26 ubuntu-vm kernel: FS:  () 
> GS:965dffc0() knlGS:
> Feb  7 15:16:26 ubuntu-vm kernel: CS:  0010 DS:  ES:  CR0: 
> 80050033
> Feb  7 15:16:26 ubuntu-vm kernel: CR2:  CR3: 3ce0f003 
> CR4: 003606f0
> Feb  7 15:16:26 ubuntu-vm kernel: DR0:  DR1:  
> DR2: 
> Feb  7 15:16:26 ubuntu-vm kernel: DR3:  DR6: fffe0ff0 
> DR7: 0400
> Feb  7 15:16:26 ubuntu-vm kernel: Call Trace:
> Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_terminate_expired+0x42/0x80
> Feb  7 15:16:26 ubuntu-vm kernel: bt_iter+0x3d/0x50
> Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_queue_tag_busy_iter+0xe9/0x200
> Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_timeout_work+0x181/0x2e0
> Feb  7 15:16:26 ubuntu-vm kernel: process_one_work+0x21c/0x6d0
> Feb  7 15:16:26 ubuntu-vm kernel: worker_thread+0x35/0x380
> Feb  7 15:16:26 ubuntu-vm kernel: kthread+0x117/0x130
> Feb  7 15:16:26 ubuntu-vm kernel: ret_from_fork+0x24/0x30
> Feb  7 15:16:26 ubuntu-vm kernel: Code: ff ff 0f ff e9 fd fe ff ff 90 66 2e 
> 0f 1f 84 00 00 00 00 00 41 55 41 54 55 48 8d af 78 01 00 00 53 48 8b 87 b0 01 
> 00 00 48 89 fb <4c> 8b 28 0f 1f 44 00 00 65
> 8b 05 6a b5 f8 3f 83 f8 0f 0f 87 ed  
> Feb  7 15:16:26 ubuntu-vm kernel: RIP: scsi_times_out+0x17/0x2c0 [scsi_mod] 
> RSP: 98f0c02abd58
> Feb  7 15:16:26 ubuntu-vm kernel: CR2: 
> Feb  7 15:16:26 ubuntu-vm kernel: ---[ end trace ce6c20d95bab450e ]---

So, that's dereferencing %rax which is NULL.  That gotta be the ->host
deref in the following.

  enum blk_eh_timer_return scsi_times_out(struct request *req)
  {
  struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
  enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
  struct Scsi_Host *host = scmd->device->host;
  ...

That sounds more like a scsi hotplug but than an issue in the timeout
code unless we messed up @req pointer to begin with.

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread Bart Van Assche
On Wed, 2018-02-07 at 23:48 +, Bart Van Assche wrote:
> With this patch applied I see requests for which it seems like the timeout 
> handler
> did not get invoked: [ ... ]

I just noticed the following in the system log, which is probably the reason 
why some
requests got stuck:

Feb  7 15:16:26 ubuntu-vm kernel: BUG: unable to handle kernel NULL pointer 
dereference at   (null)
Feb  7 15:16:26 ubuntu-vm kernel: IP: scsi_times_out+0x17/0x2c0 [scsi_mod]
Feb  7 15:16:26 ubuntu-vm kernel: PGD 0 P4D 0  
Feb  7 15:16:26 ubuntu-vm kernel: Oops:  [#1] PREEMPT SMP
Feb  7 15:16:26 ubuntu-vm kernel: Modules linked in: dm_service_time ib_srp 
libcrc32c crc32c_generic scsi_transport_srp target_core_pscsi target_core_file 
ib_srpt target_core_iblock target_core_mod
rdma_rxe ip6_udp_tunnel udp_tunnel ib_umad ib_uverbs scsi_debug brd mq_deadline 
kyber_iosched deadline_iosched cfq_iosched bfq crct10dif_pclmul crc32_pclmul 
af_packet ghash_clmulni_intel pcbc
aesni_intel aes_x86_64 crypto_simd cryptd glue_helper serio_raw virtio_console 
virtio_balloon sg button i2c_piix4 dm_multipath dm_mod dax scsi_dh_rdac 
scsi_dh_emc scsi_dh_alua ib_iser rdma_cm iw_cm
ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi 
ip_tables x_tables ipv6 autofs4 ext4 crc16 mbcache jbd2 sd_mod virtio_net 
virtio_blk virtio_scsi sr_mod cdrom ata_generic
pata_acpi psmouse crc32c_intel i2c_core atkbd
Feb  7 15:16:26 ubuntu-vm kernel: uhci_hcd ehci_hcd intel_agp ata_piix 
intel_gtt agpgart libata virtio_pci usbcore virtio_ring virtio scsi_mod 
usb_common unix
Feb  7 15:16:26 ubuntu-vm kernel: CPU: 0 PID: 146 Comm: kworker/0:1H Not 
tainted 4.15.0-dbg+ #1
Feb  7 15:16:26 ubuntu-vm kernel: Hardware name: QEMU Standard PC (i440FX + 
PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
Feb  7 15:16:26 ubuntu-vm kernel: Workqueue: kblockd blk_mq_timeout_work
Feb  7 15:16:26 ubuntu-vm kernel: RIP: 0010:scsi_times_out+0x17/0x2c0 [scsi_mod]
Feb  7 15:16:26 ubuntu-vm kernel: RSP: 0018:98f0c02abd58 EFLAGS: 00010246
Feb  7 15:16:26 ubuntu-vm kernel: RAX:  RBX: 965de2b3a2c0 
RCX: 
Feb  7 15:16:26 ubuntu-vm kernel: RDX: c0094740 RSI:  
RDI: 965de2b3a2c0
Feb  7 15:16:26 ubuntu-vm kernel: RBP: 965de2b3a438 R08: fffc 
R09: 0007
Feb  7 15:16:26 ubuntu-vm kernel: R10:  R11:  
R12: 0002
Feb  7 15:16:26 ubuntu-vm kernel: R13:  R14: 965de2a44218 
R15: 965de2a48728
Feb  7 15:16:26 ubuntu-vm kernel: FS:  () 
GS:965dffc0() knlGS:
Feb  7 15:16:26 ubuntu-vm kernel: CS:  0010 DS:  ES:  CR0: 
80050033
Feb  7 15:16:26 ubuntu-vm kernel: CR2:  CR3: 3ce0f003 
CR4: 003606f0
Feb  7 15:16:26 ubuntu-vm kernel: DR0:  DR1:  
DR2: 
Feb  7 15:16:26 ubuntu-vm kernel: DR3:  DR6: fffe0ff0 
DR7: 0400
Feb  7 15:16:26 ubuntu-vm kernel: Call Trace:
Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_terminate_expired+0x42/0x80
Feb  7 15:16:26 ubuntu-vm kernel: bt_iter+0x3d/0x50
Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_queue_tag_busy_iter+0xe9/0x200
Feb  7 15:16:26 ubuntu-vm kernel: blk_mq_timeout_work+0x181/0x2e0
Feb  7 15:16:26 ubuntu-vm kernel: process_one_work+0x21c/0x6d0
Feb  7 15:16:26 ubuntu-vm kernel: worker_thread+0x35/0x380
Feb  7 15:16:26 ubuntu-vm kernel: kthread+0x117/0x130
Feb  7 15:16:26 ubuntu-vm kernel: ret_from_fork+0x24/0x30
Feb  7 15:16:26 ubuntu-vm kernel: Code: ff ff 0f ff e9 fd fe ff ff 90 66 2e 0f 
1f 84 00 00 00 00 00 41 55 41 54 55 48 8d af 78 01 00 00 53 48 8b 87 b0 01 00 
00 48 89 fb <4c> 8b 28 0f 1f 44 00 00 65
8b 05 6a b5 f8 3f 83 f8 0f 0f 87 ed  
Feb  7 15:16:26 ubuntu-vm kernel: RIP: scsi_times_out+0x17/0x2c0 [scsi_mod] 
RSP: 98f0c02abd58
Feb  7 15:16:26 ubuntu-vm kernel: CR2: 
Feb  7 15:16:26 ubuntu-vm kernel: ---[ end trace ce6c20d95bab450e ]---

Bart.








Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread Bart Van Assche
On Wed, 2018-02-07 at 12:07 -0800, t...@kernel.org wrote:
> Ah, you're right.  u64_stat_sync doesn't imply barriers, so we want
> something like the following.
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index df93102..d6edf3b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -593,7 +593,7 @@ static void blk_mq_rq_update_aborted_gstate(struct 
> request *rq, u64 gstate)
>*/
>   local_irq_save(flags);
>   u64_stats_update_begin(>aborted_gstate_sync);
> - rq->aborted_gstate = gstate;
> + smp_store_release(>aborted_gstate, gstate);
>   u64_stats_update_end(>aborted_gstate_sync);
>   local_irq_restore(flags);
>  }
> @@ -605,7 +605,7 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
>  
>   do {
>   start = u64_stats_fetch_begin(>aborted_gstate_sync);
> - aborted_gstate = rq->aborted_gstate;
> + aborted_gstate = smp_load_acquire(>aborted_gstate);
>   } while (u64_stats_fetch_retry(>aborted_gstate_sync, start));
>  
>   return aborted_gstate;
> @@ -836,8 +836,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool 
> reserved)
>* ->aborted_gstate is set, this may lead to ignored
>* completions and further spurious timeouts.
>*/
> - blk_mq_rq_update_aborted_gstate(req, 0);
>   blk_add_timer(req);
> + blk_mq_rq_update_aborted_gstate(req, 0);
>   break;
>   case BLK_EH_NOT_HANDLED:
>   break;

Hello Tejun,

With this patch applied I see requests for which it seems like the timeout 
handler
did not get invoked:

sdc/hctx0/busy:95e04b7c {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT|MQ_TIMEOUT_EXPIRED, .state=i
n_flight, .gstate=0xed/0xed, .tag=26, .internal_tag=-1, .cmd=Write(10) 2a 00 00 
 
00 60 ba 00 00 08 00, .retries=0, .result = 0x5, .flags=TAGGED|INITIALIZED, 
 
.timeout=1.000, allocated 1093.180 s ago}

sdc/hctx0/busy:65a64e9b {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT|MQ_TIMEOUT_EXPIRED, .state=i
n_flight, .gstate=0x5/0x5, .tag=27, .internal_tag=-1, .cmd=Write(10) 2a 00 00 00
 62 d2 00 00 08 00, .retries=0, .result = 0x5, .flags=TAGGED|INITIALIZED, .t
imeout=1.000, allocated 1093.180 s ago}

[ ... ]

sdc/hctx3/busy:479cc2a9 {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT, .state=in_flight, .gstate=0
x11/0x11, .tag=57, .internal_tag=-1, .cmd=Write(10) 2a 00 00 00 61 d2 00 00 08 0
0, .retries=0, .result = 0x0, .flags=TAGGED|INITIALIZED, .timeout=1.000, allocat
ed 1093.150 s ago}

sdc/hctx3/busy:8fd130d5 {.op=WRITE, .cmd_flags=FAILFAST_TRANSPORT|SYNC|N
OMERGE|IDLE, .rq_flags=MQ_INFLIGHT|DONTPREP|IO_STAT, .state=in_flight, .gstate=0
xd/0xd, .tag=61, .internal_tag=-1, .cmd=Write(10) 2a 00 00 00 c3 94 00 00 08 00,
 .retries=0, .result = 0x0, .flags=TAGGED|INITIALIZED, .timeout=1.000, allocated
 1093.140 s ago}

As one can see for some requests MQ_TIMEOUT_EXPIRED is set and .result = 
0x5.
The value of .result means that the SCSI error handler has submitted an abort 
(see
also scmnd->result = DID_ABORT << 16 in drivers/infiniband/ulp/srp/ib_srp.c). 
For
the last two requests shown above however MQ_TIMEOUT_EXPIRED is not set and the
SCSI result has value 0. I think that means that it can happen that a request
times out but that the timeout handler does not get invoked ...

Thanks,

Bart.





Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread t...@kernel.org
Hello,

On Wed, Feb 07, 2018 at 09:02:21PM +, Bart Van Assche wrote:
> The patch that I used in my test had an smp_wmb() call (see also below). 
> Anyway,
> I will see whether I can extract more state information through debugfs.
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ef4f6df0f1df..8eb2105d82b7 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -827,13 +827,9 @@ static void blk_mq_rq_timed_out(struct request *req, 
> bool reserved)
>   __blk_mq_complete_request(req);
>   break;
>   case BLK_EH_RESET_TIMER:
> - /*
> -  * As nothing prevents from completion happening while
> -  * ->aborted_gstate is set, this may lead to ignored
> -  * completions and further spurious timeouts.
> -  */
> - blk_mq_rq_update_aborted_gstate(req, 0);
>   blk_add_timer(req);
> + smp_wmb();
> + blk_mq_rq_update_aborted_gstate(req, 0);

Without the matching rmb, just adding rmb won't do much but given the
default strong ordering on x86 and other operations around, what you
were seeing is probably not caused by lack of barriers.

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread Bart Van Assche
On Wed, 2018-02-07 at 12:09 -0800, t...@kernel.org wrote:
> Hello,
> 
> On Wed, Feb 07, 2018 at 07:03:56PM +, Bart Van Assche wrote:
> > I tried the above patch but already during the first iteration of the test I
> > noticed that the test hung, probably due to the following request that got 
> > stuck:
> > 
> > $ (cd /sys/kernel/debug/block && grep -aH . */*/*/rq_list)
> > a98cff60 {.op=SCSI_IN, .cmd_flags=, 
> > .rq_flags=MQ_INFLIGHT|PREEMPT|QUIET|IO_STAT|PM,
> >  .state=idle, .tag=22, .internal_tag=-1, .cmd=Synchronize Cache(10) 35 00 
> > 00 00 00 00, .retries=0,
> >  .result = 0x0, .flags=TAGGED, .timeout=60.000, allocated 872.690 s ago}
> 
> I'm wonder how this happened, so we can lose a completion when it
> races against BLK_EH_RESET_TIMER; however, the command should timeout
> later cuz the timer is running again now.  Maybe we actually had the
> memory barrier race that you pointed out in the other message?

Hello Tejun,

The patch that I used in my test had an smp_wmb() call (see also below). Anyway,
I will see whether I can extract more state information through debugfs.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ef4f6df0f1df..8eb2105d82b7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -827,13 +827,9 @@ static void blk_mq_rq_timed_out(struct request *req, bool 
reserved)
__blk_mq_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
-   /*
-* As nothing prevents from completion happening while
-* ->aborted_gstate is set, this may lead to ignored
-* completions and further spurious timeouts.
-*/
-   blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
+   smp_wmb();
+   blk_mq_rq_update_aborted_gstate(req, 0);
break;
case BLK_EH_NOT_HANDLED:
break;





Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread t...@kernel.org
Hello,

On Wed, Feb 07, 2018 at 07:03:56PM +, Bart Van Assche wrote:
> I tried the above patch but already during the first iteration of the test I
> noticed that the test hung, probably due to the following request that got 
> stuck:
> 
> $ (cd /sys/kernel/debug/block && grep -aH . */*/*/rq_list)
> a98cff60 {.op=SCSI_IN, .cmd_flags=, 
> .rq_flags=MQ_INFLIGHT|PREEMPT|QUIET|IO_STAT|PM,
>  .state=idle, .tag=22, .internal_tag=-1, .cmd=Synchronize Cache(10) 35 00 00 
> 00 00 00, .retries=0,
>  .result = 0x0, .flags=TAGGED, .timeout=60.000, allocated 872.690 s ago}

I'm wonder how this happened, so we can lose a completion when it
races against BLK_EH_RESET_TIMER; however, the command should timeout
later cuz the timer is running again now.  Maybe we actually had the
memory barrier race that you pointed out in the other message?

Thanks.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread t...@kernel.org
Hello, Bart.

On Wed, Feb 07, 2018 at 06:14:13PM +, Bart Van Assche wrote:
> When I wrote my comment I was not sure whether or not non-reentrancy is
> guaranteed for work queue items. However, according to what I found in the
> workqueue implementation I think that is guaranteed. So it shouldn't be
> possible that the timer activated by blk_add_timer() gets handled before
> aborted_gstate is reset. But since the timeout handler and completion

Yeah, we're basically single threaded in the timeout path.

> handlers can be executed by different CPUs, shouldn't a memory barrier be
> inserted between the blk_add_timer() call and resetting aborted_gsync to
> guarantee that a completion cannot occur before blk_add_timer() has reset
> RQF_MQ_TIMEOUT_EXPIRED?

Ah, you're right.  u64_stat_sync doesn't imply barriers, so we want
something like the following.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index df93102..d6edf3b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -593,7 +593,7 @@ static void blk_mq_rq_update_aborted_gstate(struct request 
*rq, u64 gstate)
 */
local_irq_save(flags);
u64_stats_update_begin(>aborted_gstate_sync);
-   rq->aborted_gstate = gstate;
+   smp_store_release(>aborted_gstate, gstate);
u64_stats_update_end(>aborted_gstate_sync);
local_irq_restore(flags);
 }
@@ -605,7 +605,7 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
 
do {
start = u64_stats_fetch_begin(>aborted_gstate_sync);
-   aborted_gstate = rq->aborted_gstate;
+   aborted_gstate = smp_load_acquire(>aborted_gstate);
} while (u64_stats_fetch_retry(>aborted_gstate_sync, start));
 
return aborted_gstate;
@@ -836,8 +836,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool 
reserved)
 * ->aborted_gstate is set, this may lead to ignored
 * completions and further spurious timeouts.
 */
-   blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
+   blk_mq_rq_update_aborted_gstate(req, 0);
break;
case BLK_EH_NOT_HANDLED:
break;


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread Bart Van Assche
On Wed, 2018-02-07 at 09:06 -0800, Tejun Heo wrote:
> Can you see whether by any chance the following patch fixes the issue?
> If not, can you share the repro case?
> 
> Thanks.
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index df93102..651d18c 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -836,8 +836,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool 
> reserved)
>* ->aborted_gstate is set, this may lead to ignored
>* completions and further spurious timeouts.
>*/
> - blk_mq_rq_update_aborted_gstate(req, 0);
>   blk_add_timer(req);
> + blk_mq_rq_update_aborted_gstate(req, 0);
>   break;
>   case BLK_EH_NOT_HANDLED:
>   break;

Hello Tejun,

I tried the above patch but already during the first iteration of the test I
noticed that the test hung, probably due to the following request that got 
stuck:

$ (cd /sys/kernel/debug/block && grep -aH . */*/*/rq_list)
a98cff60 {.op=SCSI_IN, .cmd_flags=, 
.rq_flags=MQ_INFLIGHT|PREEMPT|QUIET|IO_STAT|PM,
 .state=idle, .tag=22, .internal_tag=-1, .cmd=Synchronize Cache(10) 35 00 00 00 
00 00, .retries=0,
 .result = 0x0, .flags=TAGGED, .timeout=60.000, allocated 872.690 s ago}

Thanks,

Bart.




Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread Bart Van Assche
On Wed, 2018-02-07 at 09:35 -0800, t...@kernel.org wrote:
> On Wed, Feb 07, 2018 at 05:27:10PM +, Bart Van Assche wrote:
> > Even with the above change I think that there is still a race between the
> > code that handles timer resets and the completion handler.
> 
> Can you elaborate the scenario a bit further?  If you're referring to
> lost completions, we've always had that and while we can try to close
> that hole too, I don't think it's a critical issue.

Hello Tejun,

When I wrote my comment I was not sure whether or not non-reentrancy is
guaranteed for work queue items. However, according to what I found in the
workqueue implementation I think that is guaranteed. So it shouldn't be
possible that the timer activated by blk_add_timer() gets handled before
aborted_gstate is reset. But since the timeout handler and completion
handlers can be executed by different CPUs, shouldn't a memory barrier be
inserted between the blk_add_timer() call and resetting aborted_gsync to
guarantee that a completion cannot occur before blk_add_timer() has reset
RQF_MQ_TIMEOUT_EXPIRED?

Thanks,

Bart.




Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread t...@kernel.org
Hello, Bart.

On Wed, Feb 07, 2018 at 05:27:10PM +, Bart Van Assche wrote:
> Even with the above change I think that there is still a race between the
> code that handles timer resets and the completion handler. Anyway, the test

Can you elaborate the scenario a bit further?  If you're referring to
lost completions, we've always had that and while we can try to close
that hole too, I don't think it's a critical issue.

> with which I triggered these races is as follows:
> - Start from what will become kernel v4.16-rc1 and apply the patch that adds
>   SRP over RoCE support to the ib_srpt driver. See also the "[PATCH v2 00/14]
>   IB/srpt: Add RDMA/CM support" patch series
>   (https://www.spinics.net/lists/linux-rdma/msg59589.html).
> - Apply my patch series that fixes a race between the SCSI error handler and
>   SCSI transport recovery.
> - Apply my patch series that improves the stability of the SCSI target core
>   (LIO).
> - Build and install that kernel.
> - Clone the following repository: https://github.com/bvanassche/srp-test.
> - Run the following test:
>   while true; do srp-test/run_tests -c -t 02-mq; done
> - While the test is running, check whether or not something weird happens.
>   Sometimes I see that scsi_times_out() crashes. Sometimes I see while running
>   this test that a soft lockup is reported inside blk_mq_do_dispatch_ctx().
> 
> If you want I can share the tree on github that I use myself for my tests.

Heh, that's quite a bit.  I'll take up on that git tree later but for
now I'd really appreciate if you can test the patch.

Thank you very much.

-- 
tejun


Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread Bart Van Assche
On Wed, 2018-02-07 at 09:06 -0800, Tejun Heo wrote:
> On Tue, Feb 06, 2018 at 05:11:33PM -0800, Bart Van Assche wrote:
> > The following race can occur between the code that resets the timer
> > and completion handling:
> > - The code that handles BLK_EH_RESET_TIMER resets aborted_gstate.
> > - A completion occurs and blk_mq_complete_request() calls
> >   __blk_mq_complete_request().
> > - The timeout code calls blk_add_timer() and that function sets the
> >   request deadline and adjusts the timer.
> > - __blk_mq_complete_request() frees the request tag.
> > - The timer fires and the timeout handler gets called for a freed
> >   request.
> 
> Can you see whether by any chance the following patch fixes the issue?
> If not, can you share the repro case?
> 
> Thanks.
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index df93102..651d18c 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -836,8 +836,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool 
> reserved)
>* ->aborted_gstate is set, this may lead to ignored
>* completions and further spurious timeouts.
>*/
> - blk_mq_rq_update_aborted_gstate(req, 0);
>   blk_add_timer(req);
> + blk_mq_rq_update_aborted_gstate(req, 0);
>   break;
>   case BLK_EH_NOT_HANDLED:
>   break;

Hello Tejun,

Even with the above change I think that there is still a race between the
code that handles timer resets and the completion handler. Anyway, the test
with which I triggered these races is as follows:
- Start from what will become kernel v4.16-rc1 and apply the patch that adds
  SRP over RoCE support to the ib_srpt driver. See also the "[PATCH v2 00/14]
  IB/srpt: Add RDMA/CM support" patch series
  (https://www.spinics.net/lists/linux-rdma/msg59589.html).
- Apply my patch series that fixes a race between the SCSI error handler and
  SCSI transport recovery.
- Apply my patch series that improves the stability of the SCSI target core
  (LIO).
- Build and install that kernel.
- Clone the following repository: https://github.com/bvanassche/srp-test.
- Run the following test:
  while true; do srp-test/run_tests -c -t 02-mq; done
- While the test is running, check whether or not something weird happens.
  Sometimes I see that scsi_times_out() crashes. Sometimes I see while running
  this test that a soft lockup is reported inside blk_mq_do_dispatch_ctx().

If you want I can share the tree on github that I use myself for my tests.

Thanks,

Bart.

Re: [PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-07 Thread Tejun Heo
Hello, Bart.

On Tue, Feb 06, 2018 at 05:11:33PM -0800, Bart Van Assche wrote:
> The following race can occur between the code that resets the timer
> and completion handling:
> - The code that handles BLK_EH_RESET_TIMER resets aborted_gstate.
> - A completion occurs and blk_mq_complete_request() calls
>   __blk_mq_complete_request().
> - The timeout code calls blk_add_timer() and that function sets the
>   request deadline and adjusts the timer.
> - __blk_mq_complete_request() frees the request tag.
> - The timer fires and the timeout handler gets called for a freed
>   request.

Can you see whether by any chance the following patch fixes the issue?
If not, can you share the repro case?

Thanks.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index df93102..651d18c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -836,8 +836,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool 
reserved)
 * ->aborted_gstate is set, this may lead to ignored
 * completions and further spurious timeouts.
 */
-   blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
+   blk_mq_rq_update_aborted_gstate(req, 0);
break;
case BLK_EH_NOT_HANDLED:
break;


[PATCH v2] blk-mq: Fix race between resetting the timer and completion handling

2018-02-06 Thread Bart Van Assche
The following race can occur between the code that resets the timer
and completion handling:
- The code that handles BLK_EH_RESET_TIMER resets aborted_gstate.
- A completion occurs and blk_mq_complete_request() calls
  __blk_mq_complete_request().
- The timeout code calls blk_add_timer() and that function sets the
  request deadline and adjusts the timer.
- __blk_mq_complete_request() frees the request tag.
- The timer fires and the timeout handler gets called for a freed
  request.

Since I have not found any approach to fix this race that does not
require the use of atomic instructions to manipulate the request state,
fix this race as follows:
- Introduce a spinlock to protecte request state and deadline changes.
- Use the deadline instead of the request generation to detect whether
  or not a request timer got fired after reinitialization of a request.
- Store the request state in the lowest two bits of the deadline instead
  of the lowest to bits of 'gstate'.
- Remove all request member variables that became superfluous due to
  this change: gstate, aborted_gstate, gstate_seq and aborted_gstate_sync.
- Remove the state information that became superfluous due to this patch,
  namely RQF_MQ_TIMEOUT_EXPIRED.
- Remove the code that became superfluous due to this change, namely
  the RCU lock and unlock statements in blk_mq_complete_request() and
  also the synchronize_rcu() call in the timeout handler.

This patch fixes the following kernel crash:

BUG: unable to handle kernel NULL pointer dereference at   (null)
Oops:  [#1] PREEMPT SMP
CPU: 2 PID: 151 Comm: kworker/2:1H Tainted: GW4.15.0-dbg+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.0.0-prebuilt.qemu-project.org 04/01/2014
Workqueue: kblockd blk_mq_timeout_work
RIP: 0010:scsi_times_out+0x17/0x2c0 [scsi_mod]
Call Trace:
blk_mq_terminate_expired+0x42/0x80
bt_iter+0x3d/0x50
blk_mq_queue_tag_busy_iter+0xe9/0x200
blk_mq_timeout_work+0x181/0x2e0
process_one_work+0x21c/0x6d0
worker_thread+0x35/0x380
kthread+0x117/0x130
ret_from_fork+0x24/0x30

Fixes: 1d9bd5161ba3 ("blk-mq: replace timeout synchronization with a RCU and 
generation based scheme")
Signed-off-by: Bart Van Assche 
Cc: Tejun Heo 
---
 block/blk-core.c   |   3 +-
 block/blk-mq-debugfs.c |   1 -
 block/blk-mq.c | 178 +++--
 block/blk-mq.h |  25 ++-
 block/blk-timeout.c|   1 -
 block/blk.h|   4 +-
 include/linux/blkdev.h |  28 ++--
 7 files changed, 53 insertions(+), 187 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d0d104268f1a..2e3cf04f12a8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -126,8 +126,7 @@ void blk_rq_init(struct request_queue *q, struct request 
*rq)
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
-   seqcount_init(>gstate_seq);
-   u64_stats_init(>aborted_gstate_sync);
+   spin_lock_init(>state_lock);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 21cbc1f071c6..43084d64aa41 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -290,7 +290,6 @@ static const char *const rqf_name[] = {
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
-   RQF_NAME(MQ_TIMEOUT_EXPIRED),
RQF_NAME(MQ_POLL_SLEPT),
 };
 #undef RQF_NAME
diff --git a/block/blk-mq.c b/block/blk-mq.c
index df93102e2149..e17a2536ac50 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -309,7 +309,6 @@ static struct request *blk_mq_rq_ctx_init(struct 
blk_mq_alloc_data *data,
rq->special = NULL;
/* tag was already set */
rq->extra_len = 0;
-   rq->__deadline = 0;
 
INIT_LIST_HEAD(>timeout_list);
rq->timeout = 0;
@@ -531,8 +530,7 @@ static void __blk_mq_complete_request(struct request *rq)
bool shared = false;
int cpu;
 
-   WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
-   blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+   WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_COMPLETE);
 
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
@@ -581,34 +579,26 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int 
*srcu_idx)
*srcu_idx = srcu_read_lock(hctx->srcu);
 }
 
-static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+/**
+ * blk_mq_change_rq_state - atomically test and set request state
+ * @rq: Request pointer.
+ * @old: Old request state.
+ * @new: New request state.
+ */
+static bool blk_mq_change_rq_state(struct request *rq, enum mq_rq_state old,
+  enum mq_rq_state new)
 {
unsigned long flags;
+   bool changed_state = false;
 
-   /*
-* blk_mq_rq_aborted_gstate() is used from the completion path and
-* can thus be called from irq