When one req is timed out, now nvme_timeout() handles it by the
following way:

        nvme_dev_disable(dev, false);
        nvme_reset_ctrl(&dev->ctrl);
        return BLK_EH_HANDLED.

There are several issues about the above approach:

1) IO may fail during resetting

Admin IO timeout may be triggered in nvme_reset_dev() when error happens.
Normal IO timeout may be triggered too during nvme_wait_freeze() in reset
path. When the two kinds of timeout happen, the current reset mechanism
can't work any more.

2) race between nvme_start_freeze and nvme_wait_freeze() & nvme_unfreeze()

- nvme_dev_disable() and resetting controller are required for recovering
controller, but the two are run from different contexts. nvme_start_freeze()
is call from nvme_dev_disable() which is run timeout work context, and
nvme_unfreeze() is run from reset work context. Unfortunatley timeout may be
triggered during resetting controller, so nvme_start_freeze() may be run
several times.

- Also two reset work may run one by one, this may cause hang in
nvme_wait_freeze() forever too.

3) all namespace's EH require to shutdown & reset the controller

block's timeout handler is per-request-queue, that means each
namespace's error handling may shutdown & reset the whole controller,
then the shutdown from one namespace may quiese queues when resetting
from another namespace is in-progress.

This patch fixes the above issues by using nested EH:

1) run controller shutdown(nvme_dev_disable()) and resetting(nvme_reset_dev)
from one same EH context, so the above race 2) can be fixed easily.

2) always start a new context for handling EH, and cancel all in-flight
requests(include the timed-out ones) in nvme_dev_disable() by quiescing
timeout event before shutdown controller.

3) limit the max number of nested EH, when the limit is reached, removes
the controller and fails all in-flight request.

With this approach, blktest block/011 can be passed.

Cc: James Smart <james.sm...@broadcom.com>
Cc: Jianchao Wang <jianchao.w.w...@oracle.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Sagi Grimberg <s...@grimberg.me>
Cc: linux-n...@lists.infradead.org
Cc: Laurence Oberman <lober...@redhat.com>
Signed-off-by: Ming Lei <ming....@redhat.com>
---
 drivers/nvme/host/core.c |  22 +++++
 drivers/nvme/host/nvme.h |   2 +
 drivers/nvme/host/pci.c  | 252 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 256 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9e51c3e1f534..264619dc81db 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3587,6 +3587,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       down_read(&ctrl->namespaces_rwsem);
+       list_for_each_entry(ns, &ctrl->namespaces, list)
+               blk_unquiesce_timeout(ns->queue);
+       up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       down_read(&ctrl->namespaces_rwsem);
+       list_for_each_entry(ns, &ctrl->namespaces, list)
+               blk_quiesce_timeout(ns->queue);
+       up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 715239226f4c..5ed7d7ddd597 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -412,6 +412,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void 
*buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
                union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4236d79e3643..58e92c7c10e0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -71,6 +71,7 @@ struct nvme_queue;
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
                freeze_queue);
+static int nvme_reset_dev(struct nvme_dev *dev);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -113,6 +114,23 @@ struct nvme_dev {
        dma_addr_t host_mem_descs_dma;
        struct nvme_host_mem_buf_desc *host_mem_descs;
        void **host_mem_desc_bufs;
+
+       /* EH handler */
+       spinlock_t      eh_lock;
+       bool            ctrl_shutdown_started;
+       bool            ctrl_failed;
+       unsigned int    nested_eh;
+       struct work_struct fail_ctrl_work;
+       wait_queue_head_t       eh_wq;
+       struct list_head        eh_head;
+};
+
+#define  NVME_MAX_NESTED_EH    32
+struct nvme_eh_work {
+       struct work_struct      work;
+       struct nvme_dev         *dev;
+       int                     seq;
+       struct list_head        list;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1186,6 +1204,183 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 
csts)
                         csts, result);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+       dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", 
status);
+
+       nvme_get_ctrl(&dev->ctrl);
+       nvme_dev_disable(dev, false, false);
+       if (!queue_work(nvme_wq, &dev->remove_work))
+               nvme_put_ctrl(&dev->ctrl);
+}
+
+static void nvme_eh_fail_ctrl_work(struct work_struct *work)
+{
+       struct nvme_dev *dev =
+               container_of(work, struct nvme_dev, fail_ctrl_work);
+
+       dev_info(dev->ctrl.device, "EH: fail controller\n");
+       nvme_remove_dead_ctrl(dev, 0);
+}
+
+static void nvme_eh_mark_ctrl_shutdown(struct nvme_dev *dev)
+{
+       spin_lock(&dev->eh_lock);
+       dev->ctrl_shutdown_started = false;
+       spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_sched_fail_ctrl(struct nvme_dev *dev)
+{
+       INIT_WORK(&dev->fail_ctrl_work, nvme_eh_fail_ctrl_work);
+       queue_work(nvme_reset_wq, &dev->fail_ctrl_work);
+}
+
+/* either controller is updated to LIVE or will be removed */
+static bool nvme_eh_reset_done(struct nvme_dev *dev)
+{
+       return dev->ctrl.state == NVME_CTRL_LIVE ||
+               dev->ctrl.state == NVME_CTRL_ADMIN_ONLY ||
+               dev->ctrl_failed;
+}
+
+static void nvme_eh_done(struct nvme_eh_work *eh_work, int result)
+{
+       struct nvme_dev *dev = eh_work->dev;
+       bool top_eh;
+
+       spin_lock(&dev->eh_lock);
+       top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+
+       /* Fail controller if the top EH can't recover it */
+       if (!result)
+               wake_up_all(&dev->eh_wq);
+       else if (top_eh) {
+               dev->ctrl_failed = true;
+               nvme_eh_sched_fail_ctrl(dev);
+               wake_up_all(&dev->eh_wq);
+       }
+
+       list_del(&eh_work->list);
+       spin_unlock(&dev->eh_lock);
+
+       dev_info(dev->ctrl.device, "EH %d: state %d, eh_done %d, top eh %d\n",
+                       eh_work->seq, dev->ctrl.state, result, top_eh);
+       wait_event(dev->eh_wq, nvme_eh_reset_done(dev));
+
+       /* release the EH seq, so outer EH can be allocated bigger seq No. */
+       spin_lock(&dev->eh_lock);
+       dev->nested_eh--;
+       spin_unlock(&dev->eh_lock);
+
+       /*
+        * After controller is recovered in upper EH finally, we have to
+        * unfreeze queues if reset failed in this EH, otherwise blk-mq
+        * queues' freeze counter may be leaked.
+        *
+        * nvme_unfreeze() can only be called after controller state is
+        * updated to LIVE.
+        */
+       if (result && (dev->ctrl.state == NVME_CTRL_LIVE ||
+                               dev->ctrl.state == NVME_CTRL_ADMIN_ONLY))
+               nvme_unfreeze(&dev->ctrl);
+}
+
+static void nvme_eh_work(struct work_struct *work)
+{
+       struct nvme_eh_work *eh_work =
+               container_of(work, struct nvme_eh_work, work);
+       struct nvme_dev *dev = eh_work->dev;
+       int result = -ENODEV;
+       bool top_eh;
+
+       dev_info(dev->ctrl.device, "EH %d: before shutdown\n",
+                       eh_work->seq);
+       nvme_dev_disable(dev, false, true);
+
+       /* allow new EH to be created */
+       nvme_eh_mark_ctrl_shutdown(dev);
+
+       /*
+        * nvme_dev_disable cancels all in-flight requests, and wont't
+        * cause timout at all, so I am always the top EH now, but it
+        * becomes not true after 'reset_lock' is held, so have to check
+        * if I am still the top EH, and force to update to NVME_CTRL_RESETTING
+        * if yes.
+        */
+       mutex_lock(&dev->ctrl.reset_lock);
+       spin_lock(&dev->eh_lock);
+
+       /* allow top EH to preempt other inner EH */
+       top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+       dev_info(dev->ctrl.device, "EH %d: after shutdown, top eh: %d\n",
+                       eh_work->seq, top_eh);
+       if (!top_eh) {
+               if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+                       spin_unlock(&dev->eh_lock);
+                       goto done;
+               }
+       } else {
+               nvme_force_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
+               result = 0;
+       }
+
+       spin_unlock(&dev->eh_lock);
+       result = nvme_reset_dev(dev);
+done:
+       mutex_unlock(&dev->ctrl.reset_lock);
+       nvme_eh_done(eh_work, result);
+       dev_info(dev->ctrl.device, "EH %d: after recovery %d\n",
+                       eh_work->seq, result);
+
+       kfree(eh_work);
+}
+
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+       bool need_sched = false;
+       bool fail_ctrl = false;
+       struct nvme_eh_work *eh_work;
+       int seq;
+
+       spin_lock(&dev->eh_lock);
+       if (!dev->ctrl_shutdown_started) {
+               need_sched = true;
+               seq = dev->nested_eh;
+               if (++dev->nested_eh >= NVME_MAX_NESTED_EH) {
+                       if (!dev->ctrl_failed)
+                               dev->ctrl_failed = fail_ctrl = true;
+                       else
+                               need_sched = false;
+               } else
+                       dev->ctrl_shutdown_started = true;
+       }
+       spin_unlock(&dev->eh_lock);
+
+       if (!need_sched)
+               return;
+
+       if (fail_ctrl) {
+ fail_ctrl:
+               nvme_eh_sched_fail_ctrl(dev);
+               return;
+       }
+
+       eh_work = kzalloc(sizeof(*eh_work), GFP_NOIO);
+       if (!eh_work)
+               goto fail_ctrl;
+
+       eh_work->dev = dev;
+       eh_work->seq = seq;
+
+       spin_lock(&dev->eh_lock);
+       list_add_tail(&eh_work->list, &dev->eh_head);
+       spin_unlock(&dev->eh_lock);
+
+       INIT_WORK(&eh_work->work, nvme_eh_work);
+       queue_work(nvme_reset_wq, &eh_work->work);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool 
reserved)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1207,9 +1402,8 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
         */
        if (nvme_should_reset(dev, csts)) {
                nvme_warn_reset(dev, csts);
-               nvme_dev_disable(dev, false, true);
-               nvme_reset_ctrl(&dev->ctrl);
-               return BLK_EH_HANDLED;
+               nvme_eh_schedule(dev);
+               return BLK_EH_RESET_TIMER;
        }
 
        /*
@@ -1234,9 +1428,9 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, disable controller\n",
                         req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false, false);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-               return BLK_EH_HANDLED;
+               nvme_eh_schedule(dev);
+               return BLK_EH_RESET_TIMER;
        default:
                break;
        }
@@ -1250,15 +1444,13 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, reset controller\n",
                         req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false, true);
-               nvme_reset_ctrl(&dev->ctrl);
-
                /*
                 * Mark the request as handled, since the inline shutdown
                 * forces all outstanding requests to complete.
                 */
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-               return BLK_EH_HANDLED;
+               nvme_eh_schedule(dev);
+               return BLK_EH_RESET_TIMER;
        }
 
        if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2316,12 +2508,28 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool 
shutdown, bool
        }
        for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
                nvme_suspend_queue(&dev->queues[i]);
+       /*
+        * safe to sync timeout after queues are quiesced, then all
+        * requests(include the time-out ones) will be canceled.
+        */
+       nvme_quiesce_timeout(&dev->ctrl);
+       if (dev->ctrl.admin_q)
+               blk_quiesce_timeout(dev->ctrl.admin_q);
 
        nvme_pci_disable(dev);
 
+       /*
+        * Both timeout and interrupt handler have been drained, and all
+        * in-flight requests will be canceled now.
+        */
        blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
        blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, 
&dev->ctrl);
 
+       /* all requests have been canceled now, so enable timeout now */
+       nvme_unquiesce_timeout(&dev->ctrl);
+       if (dev->ctrl.admin_q)
+               blk_unquiesce_timeout(dev->ctrl.admin_q);
+
        /*
         * The driver will not be starting up queues again if shutting down so
         * must flush all entered requests to their failed completion to avoid
@@ -2381,16 +2589,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
        kfree(dev);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
-{
-       dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", 
status);
-
-       nvme_get_ctrl(&dev->ctrl);
-       nvme_dev_disable(dev, false, false);
-       if (!queue_work(nvme_wq, &dev->remove_work))
-               nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset_dev(struct nvme_dev *dev)
 {
        bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
@@ -2400,7 +2598,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
        lockdep_assert_held(&dev->ctrl.reset_lock);
 
-       if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+       if (dev->ctrl.state != NVME_CTRL_RESETTING)
                goto out;
 
        /*
@@ -2486,6 +2684,10 @@ static int nvme_reset_dev(struct nvme_dev *dev)
                unfreeze_queue = true;
        }
 
+       /* controller state may have been updated already by inner EH */
+       if (dev->ctrl.state == new_state)
+               goto reset_done;
+
        result = -ENODEV;
        /*
         * If only admin queue live, keep it to do further investigation or
@@ -2499,6 +2701,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
 
        nvme_start_ctrl(&dev->ctrl);
 
+ reset_done:
        if (unfreeze_queue)
                nvme_unfreeze(&dev->ctrl);
        return 0;
@@ -2615,6 +2818,13 @@ static unsigned long check_vendor_combination_bug(struct 
pci_dev *pdev)
        return 0;
 }
 
+static void nvme_eh_init(struct nvme_dev *dev)
+{
+       spin_lock_init(&dev->eh_lock);
+       init_waitqueue_head(&dev->eh_wq);
+       INIT_LIST_HEAD(&dev->eh_head);
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
        int node, result = -ENOMEM;
@@ -2659,6 +2869,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
 
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+       nvme_eh_init(dev);
+
        nvme_reset_ctrl(&dev->ctrl);
 
        return 0;
-- 
2.9.5

Reply via email to