[PATCH AUTOSEL 4.19 25/68] nvme-fc: resolve io failures during connect

2018-11-28 Thread Sasha Levin
From: James Smart 

[ Upstream commit 4cff280a5fccf6513ed9e895bb3a4e7ad8b0cedc ]

If an io error occurs on an io issued while connecting, recovery
of the io falls flat as the state checking ends up nooping the error
handler.

Create an err_work work item that is scheduled upon an io error while
connecting. The work thread terminates all io on all queues and marks
the queues as not connected.  The termination of the io will return
back to the callee, which will then back out of the connection attempt
and will reschedule, if possible, the connection attempt.

The changes:
- in case there are several commands hitting the error handler, a
  state flag is kept so that the error work is only scheduled once,
  on the first error. Subsequent errors can be ignored (see the
  sketch after this list).
- The calling sequence to stop keep alive and terminate the queues
  and their io is lifted from the reset routine. Made a small
  service routine used by both reset and err_work.
- During debugging, found that the teardown path can reference
  an uninitialized pointer, resulting in a NULL pointer oops.
  The aen_ops weren't initialized yet. Add validation on their
  initialization before calling the teardown routine.
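
For reference, the schedule-once behaviour reads roughly like the
standalone sketch below. It is only an illustration of the pattern:
the names, the userspace C11 atomics and the stub scheduler are
stand-ins, not the kernel code; the patch itself uses atomic_xchg()
and schedule_work() on ctrl->err_work_active and ctrl->err_work, as
shown in the diff further down.

/*
 * Userspace sketch of "schedule the error work only once".
 * Illustrative names only; builds with: cc -std=c11 sketch.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int err_work_active;

/* stand-in for schedule_work(); pretend queueing always succeeds */
static bool schedule_err_work(void)
{
	printf("error work scheduled\n");
	return true;
}

/* called for every io error seen while the association is forming */
static void io_error_while_connecting(void)
{
	/* only the 0 -> 1 transition schedules the work */
	if (atomic_exchange(&err_work_active, 1))
		return;

	if (!schedule_err_work()) {
		/* queueing failed: re-arm so a later error can retry */
		atomic_store(&err_work_active, 0);
	}
}

/* run at the end of the error work, after all io was terminated */
static void err_work_done(void)
{
	atomic_store(&err_work_active, 0);
}

int main(void)
{
	io_error_while_connecting();	/* first error: schedules */
	io_error_while_connecting();	/* duplicate: ignored */
	err_work_done();		/* work finished, flag cleared */
	io_error_while_connecting();	/* next failure schedules again */
	return 0;
}

The point is that only the first error after the flag was last
cleared does any scheduling; everything racing in behind it simply
returns, so the connect-time teardown runs once per failed
association attempt.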

Signed-off-by: James Smart 
Signed-off-by: Christoph Hellwig 
Signed-off-by: Sasha Levin 
---
 drivers/nvme/host/fc.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 611e70cae754..9375fa705d82 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -144,6 +144,7 @@ struct nvme_fc_ctrl {
 
bool                    ioq_live;
bool                    assoc_active;
+   atomic_t                err_work_active;
u64 association_id;
 
struct list_head        ctrl_list;  /* rport->ctrl_list */
@@ -152,6 +153,7 @@ struct nvme_fc_ctrl {
struct blk_mq_tag_set   tag_set;
 
struct delayed_work connect_work;
+   struct work_struct  err_work;
 
struct kref ref;
u32 flags;
@@ -1523,6 +1525,10 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
int i;
 
+   /* ensure we've initialized the ops once */
+   if (!(aen_op->flags & FCOP_FLAGS_AEN))
+   return;
+
for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++)
__nvme_fc_abort_op(ctrl, aen_op);
 }
@@ -2036,7 +2042,25 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
 static void
 nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
 {
-   /* only proceed if in LIVE state - e.g. on first error */
+   int active;
+
+   /*
+* if an error (io timeout, etc) while (re)connecting,
+* it's an error on creating the new association.
+* Start the error recovery thread if it hasn't already
+* been started. It is expected there could be multiple
+* ios hitting this path before things are cleaned up.
+*/
+   if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
+   active = atomic_xchg(&ctrl->err_work_active, 1);
+   if (!active && !schedule_work(&ctrl->err_work)) {
+   atomic_set(&ctrl->err_work_active, 0);
+   WARN_ON(1);
+   }
+   return;
+   }
+
+   /* Otherwise, only proceed if in LIVE state - e.g. on first error */
if (ctrl->ctrl.state != NVME_CTRL_LIVE)
return;
 
@@ -2802,6 +2826,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
 {
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
 
+   cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->connect_work);
/*
 * kill the association on the link side.  this will block
@@ -2854,23 +2879,30 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
 }
 
 static void
-nvme_fc_reset_ctrl_work(struct work_struct *work)
+__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
 {
-   struct nvme_fc_ctrl *ctrl =
-   container_of(work, struct nvme_fc_ctrl, ctrl.reset_work);
-   int ret;
-
-   nvme_stop_ctrl(&ctrl->ctrl);
+   nvme_stop_keep_alive(&ctrl->ctrl);
 
/* will block will waiting for io to terminate */
nvme_fc_delete_association(ctrl);
 
-   if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+   if (ctrl->ctrl.state != NVME_CTRL_CONNECTING &&
+   !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
dev_err(ctrl->ctrl.device,
"NVME-FC{%d}: error_recovery: Couldn't change state "
"to CONNECTING\n", ctrl->cnum);
-   return;
-   }
+}
+
+static void
+nvme_fc_reset_ctrl_work(struct work_struct *work)
+{
+   struct nvme_fc_ctrl *ctrl =
+   container_of(work, struct nvme_fc_ctrl, ctrl.reset_work);
+   int ret;
+
+   
