dmesg reports virtqueue failure and device reset:
virtio_pmem virtio2: failed to send command to
virtio pmem device, no free slots in the virtqueue
virtio_pmem virtio2: virtio pmem device
needs a reset

virtio_pmem_flush() can wait for a free virtqueue descriptor (-ENOSPC).
It can also wait for host completion. If the request virtqueue breaks,
those waiters may never make progress. One example is notify failure from
virtqueue_kick().

Track a device-level broken state and converge the failure to -EIO. New
requests fail fast, -ENOSPC waiters are unlinked and woken, and the
currently submitted request is woken so its host_acked waiter can return
without waiting forever for host completion. Completed requests are forced
to report an error after the queue is marked broken.

Do not detach unused buffers from an active virtqueue. Runtime
broken-queue handling only stops new submissions and wakes local waiters.
Removal resets the device first. It then drains request tokens. After
that, the device no longer owns the buffers when the virtqueue reference
is dropped.

Closes: https://lore.kernel.org/r/[email protected]/
Signed-off-by: Li Chen <[email protected]>
---
Changes in v6:
- Wake the in-flight host-completion waiter when marking the queue broken.
- Track req_inflight and clear it on completion/drain paths.
- Return -EIO if the queue breaks before a host response is observed.
Changes in v5:
- Split broken marking from token draining.
- Do not call virtqueue_detach_unused_buf() on an active queue.
- Reset the device before draining tokens in remove().
- Do not let the host-completion wait return only because the device is
  marked broken.
v2->v3:
- Add raw dmesg excerpt to the patch description.
- Drop timestamps from the embedded dmesg.
- Fold the CONFIG_VIRTIO_PMEM=m export fix into this patch.
v3->v4:
- Rebased onto v7.1-rc7 and renumbered after the flush error patches.
- Use kmalloc_obj(*req_data) at the allocation site to match current nvdimm
  code.

 drivers/nvdimm/nd_virtio.c   | 117 +++++++++++++++++++++++++++++++----
 drivers/nvdimm/virtio_pmem.c |  15 ++++-
 drivers/nvdimm/virtio_pmem.h |   8 +++
 3 files changed, 127 insertions(+), 13 deletions(-)

diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
index 35d36bd36a526..fb9391ebc46e7 100644
--- a/drivers/nvdimm/nd_virtio.c
+++ b/drivers/nvdimm/nd_virtio.c
@@ -30,6 +30,12 @@ static bool virtio_pmem_req_done(struct virtio_pmem_request 
*req)
        return smp_load_acquire(&req->done);
 }
 
+static void virtio_pmem_complete_err(struct virtio_pmem_request *req)
+{
+       req->resp.ret = cpu_to_le32(1);
+       virtio_pmem_signal_done(req);
+}
+
 static void virtio_pmem_wake_one_waiter(struct virtio_pmem *vpmem)
 {
        struct virtio_pmem_request *req_buf;
@@ -44,6 +50,63 @@ static void virtio_pmem_wake_one_waiter(struct virtio_pmem 
*vpmem)
        wake_up(&req_buf->wq_buf);
 }
 
+static void virtio_pmem_wake_all_waiters(struct virtio_pmem *vpmem)
+{
+       struct virtio_pmem_request *req, *tmp;
+
+       list_for_each_entry_safe(req, tmp, &vpmem->req_list, list) {
+               list_del_init(&req->list);
+               WRITE_ONCE(req->wq_buf_avail, true);
+               wake_up(&req->wq_buf);
+       }
+}
+
+static void virtio_pmem_clear_inflight(struct virtio_pmem *vpmem,
+                                      struct virtio_pmem_request *req)
+{
+       if (vpmem->req_inflight == req)
+               vpmem->req_inflight = NULL;
+}
+
+static void virtio_pmem_wake_inflight(struct virtio_pmem *vpmem)
+{
+       struct virtio_pmem_request *req = vpmem->req_inflight;
+
+       if (req)
+               wake_up(&req->host_acked);
+}
+
+void virtio_pmem_mark_broken(struct virtio_pmem *vpmem)
+{
+       if (!READ_ONCE(vpmem->broken)) {
+               WRITE_ONCE(vpmem->broken, true);
+               dev_err_once(&vpmem->vdev->dev, "virtqueue is broken\n");
+       }
+
+       virtio_pmem_wake_inflight(vpmem);
+       virtio_pmem_wake_all_waiters(vpmem);
+}
+EXPORT_SYMBOL_GPL(virtio_pmem_mark_broken);
+
+void virtio_pmem_drain(struct virtio_pmem *vpmem)
+{
+       struct virtio_pmem_request *req;
+       unsigned int len;
+
+       while ((req = virtqueue_get_buf(vpmem->req_vq, &len)) != NULL) {
+               virtio_pmem_clear_inflight(vpmem, req);
+               virtio_pmem_complete_err(req);
+               kref_put(&req->kref, virtio_pmem_req_release);
+       }
+
+       while ((req = virtqueue_detach_unused_buf(vpmem->req_vq)) != NULL) {
+               virtio_pmem_clear_inflight(vpmem, req);
+               virtio_pmem_complete_err(req);
+               kref_put(&req->kref, virtio_pmem_req_release);
+       }
+}
+EXPORT_SYMBOL_GPL(virtio_pmem_drain);
+
  /* The interrupt handler */
 void virtio_pmem_host_ack(struct virtqueue *vq)
 {
@@ -54,8 +117,12 @@ void virtio_pmem_host_ack(struct virtqueue *vq)
 
        spin_lock_irqsave(&vpmem->pmem_lock, flags);
        while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) {
+               virtio_pmem_clear_inflight(vpmem, req_data);
                virtio_pmem_wake_one_waiter(vpmem);
-               virtio_pmem_signal_done(req_data);
+               if (READ_ONCE(vpmem->broken))
+                       virtio_pmem_complete_err(req_data);
+               else
+                       virtio_pmem_signal_done(req_data);
                kref_put(&req_data->kref, virtio_pmem_req_release);
        }
        spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
@@ -83,6 +150,9 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
                return -EIO;
        }
 
+       if (READ_ONCE(vpmem->broken))
+               return -EIO;
+
        req_data = kmalloc_obj(*req_data, GFP_NOIO);
        if (!req_data)
                return -ENOMEM;
@@ -99,13 +169,18 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
        sgs[1] = &ret;
 
        spin_lock_irqsave(&vpmem->pmem_lock, flags);
-        /*
-         * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
-         * queue does not have free descriptor. We add the request
-         * to req_list and wait for host_ack to wake us up when free
-         * slots are available.
-         */
+       /*
+        * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
+        * queue does not have free descriptor. We add the request
+        * to req_list and wait for host_ack to wake us up when free
+        * slots are available.
+        */
        for (;;) {
+               if (READ_ONCE(vpmem->broken)) {
+                       err = -EIO;
+                       break;
+               }
+
                err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data,
                                        GFP_ATOMIC);
                if (!err) {
@@ -114,6 +189,7 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
                         * held so completion cannot run concurrently.
                         */
                        kref_get(&req_data->kref);
+                       vpmem->req_inflight = req_data;
                        break;
                }
 
@@ -127,24 +203,41 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
                spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
 
                /* A host response results in "host_ack" getting called */
-               wait_event(req_data->wq_buf, READ_ONCE(req_data->wq_buf_avail));
+               wait_event(req_data->wq_buf,
+                          READ_ONCE(req_data->wq_buf_avail) ||
+                          READ_ONCE(vpmem->broken));
                spin_lock_irqsave(&vpmem->pmem_lock, flags);
+
+               if (READ_ONCE(vpmem->broken))
+                       break;
        }
 
-       err1 = virtqueue_kick(vpmem->req_vq);
+       if (err == -EIO || virtqueue_is_broken(vpmem->req_vq))
+               virtio_pmem_mark_broken(vpmem);
+
+       err1 = true;
+       if (!err && !READ_ONCE(vpmem->broken)) {
+               err1 = virtqueue_kick(vpmem->req_vq);
+               if (!err1)
+                       virtio_pmem_mark_broken(vpmem);
+       }
        spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
        /*
         * virtqueue_add_sgs failed with error different than -ENOSPC, we can't
         * do anything about that.
         */
-       if (err || !err1) {
+       if (READ_ONCE(vpmem->broken) || err || !err1) {
                dev_info(&vdev->dev, "failed to send command to virtio pmem 
device\n");
                err = -EIO;
        } else {
                /* A host response results in "host_ack" getting called */
                wait_event(req_data->host_acked,
-                          virtio_pmem_req_done(req_data));
-               err = le32_to_cpu(req_data->resp.ret);
+                          virtio_pmem_req_done(req_data) ||
+                          READ_ONCE(vpmem->broken));
+               if (virtio_pmem_req_done(req_data))
+                       err = le32_to_cpu(req_data->resp.ret);
+               else
+                       err = -EIO;
        }
 
        kref_put(&req_data->kref, virtio_pmem_req_release);
diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
index 77b1966619059..b272e9279ef23 100644
--- a/drivers/nvdimm/virtio_pmem.c
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -25,6 +25,8 @@ static int init_vq(struct virtio_pmem *vpmem)
 
        spin_lock_init(&vpmem->pmem_lock);
        INIT_LIST_HEAD(&vpmem->req_list);
+       vpmem->req_inflight = NULL;
+       WRITE_ONCE(vpmem->broken, false);
 
        return 0;
 };
@@ -138,10 +140,21 @@ static int virtio_pmem_probe(struct virtio_device *vdev)
 static void virtio_pmem_remove(struct virtio_device *vdev)
 {
        struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
+       struct virtio_pmem *vpmem = vdev->priv;
+       unsigned long flags;
+
+       spin_lock_irqsave(&vpmem->pmem_lock, flags);
+       virtio_pmem_mark_broken(vpmem);
+       spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+       virtio_reset_device(vdev);
+
+       spin_lock_irqsave(&vpmem->pmem_lock, flags);
+       virtio_pmem_drain(vpmem);
+       spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
 
        nvdimm_bus_unregister(nvdimm_bus);
        vdev->config->del_vqs(vdev);
-       virtio_reset_device(vdev);
 }
 
 static int virtio_pmem_freeze(struct virtio_device *vdev)
diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
index 23bff40249c1b..bc7de2b328985 100644
--- a/drivers/nvdimm/virtio_pmem.h
+++ b/drivers/nvdimm/virtio_pmem.h
@@ -52,6 +52,12 @@ struct virtio_pmem {
        /* List to store deferred work if virtqueue is full */
        struct list_head req_list;
 
+       /* Request currently owned by the virtqueue. */
+       struct virtio_pmem_request *req_inflight;
+
+       /* Fail fast and wake waiters if the request virtqueue is broken. */
+       bool broken;
+
        /* Synchronize virtqueue data */
        spinlock_t pmem_lock;
 
@@ -61,5 +67,7 @@ struct virtio_pmem {
 };
 
 void virtio_pmem_host_ack(struct virtqueue *vq);
+void virtio_pmem_mark_broken(struct virtio_pmem *vpmem);
+void virtio_pmem_drain(struct virtio_pmem *vpmem);
 int async_pmem_flush(struct nd_region *nd_region, struct bio *bio);
 #endif
-- 
2.52.0

Reply via email to