Currently, when a request times out, nvme_timeout() handles it in the
following way:

        nvme_dev_disable(dev, false);
        nvme_reset_ctrl(&dev->ctrl);
        return BLK_EH_HANDLED.

which may introduce the following issues:

1) a subsequent timeout on another request may call nvme_dev_disable()
again, which may quiesce the queues again while the reset is still in
progress, so that finally nothing can make progress.

2) when a timeout is triggered inside the reset work function,
nvme_wait_freeze() may wait forever because the controller can no
longer be recovered at all.

This patch fixes these issues by:

1) handling the timeout event in a single EH thread, which is woken up
whenever controller recovery is needed;

2) inside the EH handler, draining timeout work via nvme_quiesce_timeout()
before canceling in-flight requests, so all requests can be canceled or
completed by either the EH or the block timeout handler;

3) moving the draining of I/O and the unfreezing of queues into a
dedicated post-EH work context, so the controller can always be
recovered.
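
With this patch, the flow becomes the following (a simplified sketch of
the code added below):

        /* nvme_timeout(): only schedule recovery */
        nvme_eh_schedule(dev);
        return BLK_EH_HANDLED;

        /* EH thread: recover the controller */
        nvme_start_freeze(&dev->ctrl);
        nvme_dev_disable(dev, false);
        nvme_reset_ctrl_sync(&dev->ctrl);
        queue_work(nvme_reset_wq, &dev->post_eh_work);

        /* post-EH work: drain I/O and unfreeze queues */
        nvme_wait_freeze(&dev->ctrl);
        nvme_unfreeze(&dev->ctrl);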

This patch fixes the failures triggered by the horrible block/011 test.
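(For reference, block/011 is part of the blktests suite; assuming a
blktests checkout with the NVMe device under test configured in
TEST_DEVS, it can be run as "./check block/011".)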

Cc: Jianchao Wang <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Sagi Grimberg <[email protected]>
Cc: [email protected]
Signed-off-by: Ming Lei <[email protected]>
---
 drivers/nvme/host/core.c |  22 ++++++++
 drivers/nvme/host/nvme.h |   3 +
 drivers/nvme/host/pci.c  | 140 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 155 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f9028873298e..664a268accd1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3582,6 +3582,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       down_read(&ctrl->namespaces_rwsem);
+       list_for_each_entry(ns, &ctrl->namespaces, list)
+               blk_unquiesce_timeout(ns->queue);
+       up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       down_read(&ctrl->namespaces_rwsem);
+       list_for_each_entry(ns, &ctrl->namespaces, list)
+               blk_quiesce_timeout(ns->queue);
+       up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a19b7f04ac24..956ee19ff403 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -19,6 +19,7 @@
 #include <linux/pci.h>
 #include <linux/kref.h>
 #include <linux/blk-mq.h>
+#include <linux/blkdev.h>
 #include <linux/lightnvm.h>
 #include <linux/sed-opal.h>
 #include <linux/fault-inject.h>
@@ -405,6 +406,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
                union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8172ee584130..86c14d9051ff 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -29,6 +29,7 @@
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/sed-opal.h>
+#include <linux/kthread.h>
 
 #include "nvme.h"
 
@@ -112,6 +113,11 @@ struct nvme_dev {
        dma_addr_t host_mem_descs_dma;
        struct nvme_host_mem_buf_desc *host_mem_descs;
        void **host_mem_desc_bufs;
+
+       spinlock_t        eh_lock;
+       bool            eh_in_recovery;
+       struct task_struct    *ehandler;
+       struct work_struct post_eh_work;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1176,6 +1182,100 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
                         csts, result);
 }
 
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+       spin_lock(&dev->eh_lock);
+       if (!dev->eh_in_recovery) {
+               dev->eh_in_recovery = true;
+               wake_up_process(dev->ehandler);
+       }
+       spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_done(struct nvme_dev *dev)
+{
+       spin_lock(&dev->eh_lock);
+       if (dev->eh_in_recovery)
+               dev->eh_in_recovery = false;
+       spin_unlock(&dev->eh_lock);
+}
+
+static int nvme_error_handler(void *data)
+{
+       struct nvme_dev *dev = data;
+
+       while (true) {
+               /*
+                * The sequence in kthread_stop() sets the stop flag first,
+                * then wakes the process.  To avoid missed wakeups, the task
+                * should always be in a non-running state before the stop
+                * flag is checked.
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop())
+                       break;
+
+               spin_lock(&dev->eh_lock);
+               if (!dev->eh_in_recovery) {
+                       spin_unlock(&dev->eh_lock);
+                       schedule();
+                       continue;
+               }
+               spin_unlock(&dev->eh_lock);
+
+               __set_current_state(TASK_RUNNING);
+
+               dev_info(dev->ctrl.device, "start eh recovery\n");
+
+               /* freeze queues before recovery */
+               nvme_start_freeze(&dev->ctrl);
+
+               nvme_dev_disable(dev, false);
+
+               /*
+                * reset won't wait for IO completion any more, so it
+                * is safe to reset the controller in a synchronous way
+                */
+               nvme_reset_ctrl_sync(&dev->ctrl);
+
+               dev_info(dev->ctrl.device, "eh recovery done\n");
+
+               /*
+                * drain IO & unfreeze queues in another context because
+                * these IOs may trigger timeouts again
+                */
+               queue_work(nvme_reset_wq, &dev->post_eh_work);
+       }
+       __set_current_state(TASK_RUNNING);
+       dev->ehandler = NULL;
+
+       return 0;
+}
+
+static void nvme_post_eh_work(struct work_struct *work)
+{
+       struct nvme_dev *dev =
+               container_of(work, struct nvme_dev, post_eh_work);
+
+       nvme_wait_freeze(&dev->ctrl);
+       nvme_unfreeze(&dev->ctrl);
+}
+
+static int nvme_eh_init(struct nvme_dev *dev)
+{
+       spin_lock_init(&dev->eh_lock);
+       INIT_WORK(&dev->post_eh_work, nvme_post_eh_work);
+
+       dev->ehandler = kthread_run(nvme_error_handler, dev,
+                       "nvme_eh_%d", dev->ctrl.instance);
+       if (IS_ERR(dev->ehandler)) {
+               dev_err(dev->ctrl.device, "error handler thread failed to spawn, error = %ld\n",
+                       PTR_ERR(dev->ehandler));
+               return PTR_ERR(dev->ehandler);
+       }
+       return 0;
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1197,8 +1297,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
         */
        if (nvme_should_reset(dev, csts)) {
                nvme_warn_reset(dev, csts);
-               nvme_dev_disable(dev, false);
-               nvme_reset_ctrl(&dev->ctrl);
+               nvme_eh_schedule(dev);
                return BLK_EH_HANDLED;
        }
 
@@ -1224,8 +1323,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, disable controller\n",
                         req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+               nvme_eh_schedule(dev);
                return BLK_EH_HANDLED;
        default:
                break;
@@ -1240,14 +1339,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, reset controller\n",
                         req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false);
-               nvme_reset_ctrl(&dev->ctrl);
-
                /*
                 * Mark the request as handled, since the inline shutdown
                 * forces all outstanding requests to complete.
                 */
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+               nvme_eh_schedule(dev);
                return BLK_EH_HANDLED;
        }
 
@@ -2246,8 +2343,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
        if (pci_is_enabled(pdev)) {
                u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-               if (dev->ctrl.state == NVME_CTRL_LIVE ||
-                   dev->ctrl.state == NVME_CTRL_RESETTING) {
+               if (shutdown && (dev->ctrl.state == NVME_CTRL_LIVE ||
+                   dev->ctrl.state == NVME_CTRL_RESETTING)) {
                        nvme_start_freeze(&dev->ctrl);
                        frozen = true;
                }
@@ -2281,11 +2378,23 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
        for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
                nvme_suspend_queue(&dev->queues[i]);
 
+       /* safe to sync timeout after queues are quiesced */
+       nvme_quiesce_timeout(&dev->ctrl);
+       blk_quiesce_timeout(dev->ctrl.admin_q);
+
        nvme_pci_disable(dev);
 
+       /*
+        * Both the timeout and interrupt handlers have been drained, and all
+        * in-flight requests will be canceled now.
+        */
        blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
        blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+       /* all requests are canceled now, so re-enable timeout handling */
+       nvme_unquiesce_timeout(&dev->ctrl);
+       blk_unquiesce_timeout(dev->ctrl.admin_q);
+
        /*
         * The driver will not be starting up queues again if shutting down so
         * must flush all entered requests to their failed completion to avoid
@@ -2416,9 +2525,14 @@ static void nvme_reset_work(struct work_struct *work)
        if (result)
                goto out;
 
+       nvme_eh_done(dev);
+
        /*
         * Keep the controller around but remove all namespaces if we don't have
         * any working I/O queue.
+        *
+        * We no longer wait for IO completion here, so a sync reset
+        * can be used safely anywhere.
         */
        if (dev->online_queues < 2) {
                dev_warn(dev->ctrl.device, "IO queues not created\n");
@@ -2427,11 +2541,9 @@ static void nvme_reset_work(struct work_struct *work)
                new_state = NVME_CTRL_ADMIN_ONLY;
        } else {
                nvme_start_queues(&dev->ctrl);
-               nvme_wait_freeze(&dev->ctrl);
                /* hit this only when allocate tagset fails */
                if (nvme_dev_add(dev))
                        new_state = NVME_CTRL_ADMIN_ONLY;
-               nvme_unfreeze(&dev->ctrl);
        }
 
        /*
@@ -2449,6 +2561,7 @@ static void nvme_reset_work(struct work_struct *work)
 
  out:
        nvme_remove_dead_ctrl(dev, result);
+       nvme_eh_done(dev);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2590,10 +2703,15 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+       if (nvme_eh_init(dev))
+               goto uninit_ctrl;
+
        nvme_reset_ctrl(&dev->ctrl);
 
        return 0;
 
+ uninit_ctrl:
+       nvme_uninit_ctrl(&dev->ctrl);
  release_pools:
        nvme_release_prp_pools(dev);
  unmap:
@@ -2647,6 +2765,8 @@ static void nvme_remove(struct pci_dev *pdev)
        nvme_stop_ctrl(&dev->ctrl);
        nvme_remove_namespaces(&dev->ctrl);
        nvme_dev_disable(dev, true);
+       if (dev->ehandler)
+               kthread_stop(dev->ehandler);
        nvme_free_host_mem(dev);
        nvme_dev_remove_admin(dev);
        nvme_free_queues(dev, 0);
-- 
2.9.5
