Re: [PATCH 03/14] nvme: return BLK_EH_DONE from ->timeout

2018-05-28 Thread Ming Lei
On Mon, May 28, 2018 at 01:44:25PM +0200, Christoph Hellwig wrote:
> On Thu, May 24, 2018 at 12:45:15PM +0800, Ming Lei wrote:
> > This change should have been done after '[PATCH 13/14] blk-mq: Remove
> > generation seqeunce', otherwise the timed-out request won't be completed
> > by nvme_cancel_request() at all because we always marked this request as
> > 'COMPLETE' before calling .timeout().
> 
> Yes.  Or we should start out with reverting the whole series introducing
> the gstate.  I think it has shown to create much more problems than it
> solved, and starting from a clean state without it will allow us to
> iterate much saner.

It may not help for this case by reverting the series introducing the gstate.

This behaviour has been used from the beginning, such as, the code
becomes the following after the revert:

if (!blk_mark_rq_complete(rq))
blk_mq_rq_timed_out(rq, reserved);

Given the patch 13("blk-mq: Remove generation seqeunce") is the 1st one
to change the behaviour, we should put this patch and other related ones
after patch 13 for avoiding to break 'git bisect'.

Thanks,
Ming


Re: [PATCH 03/14] nvme: return BLK_EH_DONE from ->timeout

2018-05-28 Thread Christoph Hellwig
On Wed, May 23, 2018 at 03:27:35PM +0200, Hannes Reinecke wrote:
> Is there a way of _testing_ this patch?

Use the error injection framework Johannes wrote?  Use blktests
(but make sure you have a fixed version of test 011).

> It looks pretty dodgy, just replacing BLK_EH_HANDLED with 
> BLK_EH_NOT_HANDLED.
> And, as nothing else has changed, would imply that we can do it on 
> older/stable versions, too.
> Which means that the original code was buggy to start with.
> Hence a test here would be really beneficial.

NVMe always completed the requests itself, so not too much changed.



Re: [PATCH 03/14] nvme: return BLK_EH_DONE from ->timeout

2018-05-28 Thread Christoph Hellwig
On Thu, May 24, 2018 at 12:45:15PM +0800, Ming Lei wrote:
> This change should have been done after '[PATCH 13/14] blk-mq: Remove
> generation seqeunce', otherwise the timed-out request won't be completed
> by nvme_cancel_request() at all because we always marked this request as
> 'COMPLETE' before calling .timeout().

Yes.  Or we should start out with reverting the whole series introducing
the gstate.  I think it has shown to create much more problems than it
solved, and starting from a clean state without it will allow us to
iterate much saner.


Re: [PATCH 03/14] nvme: return BLK_EH_DONE from ->timeout

2018-05-23 Thread Ming Lei
On Wed, May 23, 2018 at 02:19:30PM +0200, Christoph Hellwig wrote:
> NVMe always completes the request before returning from ->timeout, either
> by polling for it, or by disabling the controller.  Return BLK_EH_DONE so
> that the block layer doesn't even try to complete it again.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/nvme/host/pci.c| 14 +-
>  drivers/nvme/host/rdma.c   |  2 +-
>  drivers/nvme/target/loop.c |  2 +-
>  3 files changed, 7 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 917e1714f7d9..31525324b79f 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1205,7 +1205,7 @@ static enum blk_eh_timer_return nvme_timeout(struct 
> request *req, bool reserved)
>   nvme_warn_reset(dev, csts);
>   nvme_dev_disable(dev, false);
> - nvme_reset_ctrl(&dev->ctrl);
> - return BLK_EH_HANDLED;
> + return BLK_EH_DONE;
>   }
>  
>   /*
> @@ -1215,14 +1215,14 @@ static enum blk_eh_timer_return nvme_timeout(struct 
> request *req, bool reserved)
>   dev_warn(dev->ctrl.device,
>"I/O %d QID %d timeout, completion polled\n",
>req->tag, nvmeq->qid);
> - return BLK_EH_HANDLED;
> + return BLK_EH_DONE;
>   }
>  
>   /*
>* Shutdown immediately if controller times out while starting. The
>* reset work will see the pci device disabled when it gets the forced
>* cancellation error. All outstanding requests are completed on
> -  * shutdown, so we return BLK_EH_HANDLED.
> +  * shutdown, so we return BLK_EH_DONE.
>*/
>   switch (dev->ctrl.state) {
>   case NVME_CTRL_CONNECTING:
> @@ -1232,7 +1232,7 @@ static enum blk_eh_timer_return nvme_timeout(struct 
> request *req, bool reserved)
>req->tag, nvmeq->qid);
>   nvme_dev_disable(dev, false);
>   nvme_req(req)->flags |= NVME_REQ_CANCELLED;
> - return BLK_EH_HANDLED;
> + return BLK_EH_DONE;
>   default:
>   break;
>   }
> @@ -1249,12 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct 
> request *req, bool reserved)
>   nvme_dev_disable(dev, false);
> >   nvme_reset_ctrl(&dev->ctrl);
>  
> - /*
> -  * Mark the request as handled, since the inline shutdown
> -  * forces all outstanding requests to complete.
> -  */
>   nvme_req(req)->flags |= NVME_REQ_CANCELLED;
> - return BLK_EH_HANDLED;
> + return BLK_EH_DONE;
>   }
>  
> >   if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 1eb4438a8763..ac7462cd7f0f 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1598,7 +1598,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
>   /* fail with DNR on cmd timeout */
>   nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
>  
> - return BLK_EH_HANDLED;
> + return BLK_EH_DONE;
>  }
>  
>  static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
> diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
> index 27a8561c0cb9..22e3627bf16b 100644
> --- a/drivers/nvme/target/loop.c
> +++ b/drivers/nvme/target/loop.c
> @@ -146,7 +146,7 @@ nvme_loop_timeout(struct request *rq, bool reserved)
>   /* fail with DNR on admin cmd timeout */
>   nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
>  
> - return BLK_EH_HANDLED;
> + return BLK_EH_DONE;
>  }
>  
>  static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> -- 
> 2.17.0
> 

This change should have been done after '[PATCH 13/14] blk-mq: Remove
generation seqeunce', otherwise the timed-out request won't be completed
by nvme_cancel_request() at all because we always marked this request as
'COMPLETE' before calling .timeout().

Thanks,
Ming


Re: [PATCH 03/14] nvme: return BLK_EH_DONE from ->timeout

2018-05-23 Thread Hannes Reinecke

On 05/23/2018 02:19 PM, Christoph Hellwig wrote:

NVMe always completes the request before returning from ->timeout, either
by polling for it, or by disabling the controller.  Return BLK_EH_DONE so
that the block layer doesn't even try to complete it again.

Signed-off-by: Christoph Hellwig 
---
  drivers/nvme/host/pci.c| 14 +-
  drivers/nvme/host/rdma.c   |  2 +-
  drivers/nvme/target/loop.c |  2 +-
  3 files changed, 7 insertions(+), 11 deletions(-)


Is there a way of _testing_ this patch?
It looks pretty dodgy, just replacing BLK_EH_HANDLED with 
BLK_EH_NOT_HANDLED.
And, as nothing else has changed, would imply that we can do it on 
older/stable versions, too.

Which means that the original code was buggy to start with.
Hence a test here would be really beneficial.

But then, assuming you did some tests here:

Reviewed-by: Hannes Reinecke 

Cheers,

Hannes


Re: [PATCH 03/14] nvme: return BLK_EH_DONE from ->timeout

2018-05-23 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


[PATCH 03/14] nvme: return BLK_EH_DONE from ->timeout

2018-05-23 Thread Christoph Hellwig
NVMe always completes the request before returning from ->timeout, either
by polling for it, or by disabling the controller.  Return BLK_EH_DONE so
that the block layer doesn't even try to complete it again.

Signed-off-by: Christoph Hellwig 
---
 drivers/nvme/host/pci.c| 14 +-
 drivers/nvme/host/rdma.c   |  2 +-
 drivers/nvme/target/loop.c |  2 +-
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 917e1714f7d9..31525324b79f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1205,7 +1205,7 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
nvme_warn_reset(dev, csts);
nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
-   return BLK_EH_HANDLED;
+   return BLK_EH_DONE;
}
 
/*
@@ -1215,14 +1215,14 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
dev_warn(dev->ctrl.device,
 "I/O %d QID %d timeout, completion polled\n",
 req->tag, nvmeq->qid);
-   return BLK_EH_HANDLED;
+   return BLK_EH_DONE;
}
 
/*
 * Shutdown immediately if controller times out while starting. The
 * reset work will see the pci device disabled when it gets the forced
 * cancellation error. All outstanding requests are completed on
-* shutdown, so we return BLK_EH_HANDLED.
+* shutdown, so we return BLK_EH_DONE.
 */
switch (dev->ctrl.state) {
case NVME_CTRL_CONNECTING:
@@ -1232,7 +1232,7 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
 req->tag, nvmeq->qid);
nvme_dev_disable(dev, false);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-   return BLK_EH_HANDLED;
+   return BLK_EH_DONE;
default:
break;
}
@@ -1249,12 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
 
-   /*
-* Mark the request as handled, since the inline shutdown
-* forces all outstanding requests to complete.
-*/
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-   return BLK_EH_HANDLED;
+   return BLK_EH_DONE;
}
 
	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 1eb4438a8763..ac7462cd7f0f 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1598,7 +1598,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
/* fail with DNR on cmd timeout */
nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
 
-   return BLK_EH_HANDLED;
+   return BLK_EH_DONE;
 }
 
 static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 27a8561c0cb9..22e3627bf16b 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -146,7 +146,7 @@ nvme_loop_timeout(struct request *rq, bool reserved)
/* fail with DNR on admin cmd timeout */
nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
 
-   return BLK_EH_HANDLED;
+   return BLK_EH_DONE;
 }
 
 static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
-- 
2.17.0