[PATCH 4.14 110/110] nvme-rdma: don't suppress send completions

2018-03-07 Thread Greg Kroah-Hartman
4.14-stable review patch.  If anyone has any objections, please let me know.

--

From: Sagi Grimberg 

commit b4b591c87f2b0f4ebaf3a68d4f13873b241aa584 upstream.

The entire completion suppression mechanism is currently broken because the
HCA might retry a send operation (due to a dropped ack) after the nvme
transaction has completed.

In order to handle this, we signal all send completions and introduce a
separate done handler for async events, as they are handled differently
(they don't include in-capsule data by definition).
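
For reference, the net effect of the change is that every send work request
is now posted with IB_SEND_SIGNALED, rather than only every queue-depth/2
sends as the old sig_count logic did. A minimal sketch of the resulting
posting path (illustrative only, with a made-up helper name; the real code is
the reworked nvme_rdma_post_send() in the diff below):

    #include <rdma/ib_verbs.h>

    /*
     * Sketch of the new behaviour: always request a completion for the
     * send WR instead of suppressing most of them, so the driver is
     * notified when the HCA is finished with the WR (retries included).
     */
    static int post_signaled_send(struct ib_qp *qp, struct ib_cqe *cqe,
                                  struct ib_sge *sge, u32 num_sge)
    {
            struct ib_send_wr wr = {}, *bad_wr;

            wr.wr_cqe     = cqe;        /* completion routed to cqe->done() */
            wr.sg_list    = sge;
            wr.num_sge    = num_sge;
            wr.opcode     = IB_WR_SEND;
            wr.send_flags = IB_SEND_SIGNALED;   /* never suppress */

            return ib_post_send(qp, &wr, &bad_wr);
    }

Signalling every send costs one extra CQE per command. Async event commands
get their own minimal done handler (nvme_rdma_async_done() in the diff),
since they are submitted from a dedicated sqe outside the block-layer tagset
and carry no in-capsule data.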

Signed-off-by: Sagi Grimberg 
Reviewed-by: Max Gurtovoy 
Signed-off-by: Christoph Hellwig 
Signed-off-by: Greg Kroah-Hartman 


---
 drivers/nvme/host/rdma.c |   54 ---
 1 file changed, 14 insertions(+), 40 deletions(-)

--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -88,7 +88,6 @@ enum nvme_rdma_queue_flags {
 
 struct nvme_rdma_queue {
struct nvme_rdma_qe *rsp_ring;
-   atomic_t        sig_count;
int queue_size;
size_t  cmnd_capsule_len;
struct nvme_rdma_ctrl   *ctrl;
@@ -521,7 +520,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
queue->cmnd_capsule_len = sizeof(struct nvme_command);
 
queue->queue_size = queue_size;
-   atomic_set(&queue->sig_count, 0);
 
queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
RDMA_PS_TCP, IB_QPT_RC);
@@ -1232,21 +1230,9 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
nvme_end_request(rq, req->status, req->result);
 }
 
-/*
- * We want to signal completion at least every queue depth/2.  This returns the
- * largest power of two that is not above half of (queue size + 1) to optimize
- * (avoid divisions).
- */
-static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
-{
-   int limit = 1 << ilog2((queue->queue_size + 1) / 2);
-
-   return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
-}
-
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
-   struct ib_send_wr *first, bool flush)
+   struct ib_send_wr *first)
 {
struct ib_send_wr wr, *bad_wr;
int ret;
@@ -1255,31 +1241,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
sge->length = sizeof(struct nvme_command),
sge->lkey   = queue->device->pd->local_dma_lkey;
 
-   qe->cqe.done = nvme_rdma_send_done;
-
wr.next   = NULL;
wr.wr_cqe = &qe->cqe;
wr.sg_list= sge;
wr.num_sge= num_sge;
wr.opcode = IB_WR_SEND;
-   wr.send_flags = 0;
-
-   /*
-* Unsignalled send completions are another giant desaster in the
-* IB Verbs spec:  If we don't regularly post signalled sends
-* the send queue will fill up and only a QP reset will rescue us.
-* Would have been way to obvious to handle this in hardware or
-* at least the RDMA stack..
-*
-* Always signal the flushes. The magic request used for the flush
-* sequencer is not allocated in our driver's tagset and it's
-* triggered to be freed by blk_cleanup_queue(). So we need to
-* always mark it as signaled to ensure that the "wr_cqe", which is
-* embedded in request's payload, is not freed when __ib_process_cq()
-* calls wr_cqe->done().
-*/
-   if (nvme_rdma_queue_sig_limit(queue) || flush)
-   wr.send_flags |= IB_SEND_SIGNALED;
+   wr.send_flags = IB_SEND_SIGNALED;
 
if (first)
first->next = &wr;
@@ -1329,6 +1296,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
return queue->ctrl->tag_set.tags[queue_idx - 1];
 }
 
+static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+   if (unlikely(wc->status != IB_WC_SUCCESS))
+   nvme_rdma_wr_error(cq, wc, "ASYNC");
+}
+
 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
 {
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1350,10 +1323,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
cmd->common.flags |= NVME_CMD_SGL_METABUF;
nvme_rdma_set_sg_null(cmd);
 
+   sqe->cqe.done = nvme_rdma_async_done;
+
ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
DMA_TO_DEVICE);
 
-   ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
+   ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
WARN_ON_ONCE(ret);
 }
 
@@ -1639,7 +1614,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_rdma_qe *sqe = &req->sqe;
struct nvme_command *c = sqe->data;
-   bool flush = false;
struct ib_device *dev;
blk_status_t ret;
int err;
@@ -1668,13 +1642,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,