[PATCH 4.14 110/110] nvme-rdma: dont suppress send completions
4.14-stable review patch. If anyone has any objections, please let me know. -- From: Sagi Grimbergcommit b4b591c87f2b0f4ebaf3a68d4f13873b241aa584 upstream. The entire completions suppress mechanism is currently broken because the HCA might retry a send operation (due to dropped ack) after the nvme transaction has completed. In order to handle this, we signal all send completions and introduce a separate done handler for async events as they will be handled differently (as they don't include in-capsule data by definition). Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/rdma.c | 54 --- 1 file changed, 14 insertions(+), 40 deletions(-) --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -88,7 +88,6 @@ enum nvme_rdma_queue_flags { struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; - atomic_tsig_count; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; @@ -521,7 +520,6 @@ static int nvme_rdma_alloc_queue(struct queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; - atomic_set(>sig_count, 0); queue->cm_id = rdma_create_id(_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); @@ -1232,21 +1230,9 @@ static void nvme_rdma_send_done(struct i nvme_end_request(rq, req->status, req->result); } -/* - * We want to signal completion at least every queue depth/2. This returns the - * largest power of two that is not above half of (queue size + 1) to optimize - * (avoid divisions). - */ -static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) -{ - int limit = 1 << ilog2((queue->queue_size + 1) / 2); - - return (atomic_inc_return(>sig_count) & (limit - 1)) == 0; -} - static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, - struct ib_send_wr *first, bool flush) + struct ib_send_wr *first) { struct ib_send_wr wr, *bad_wr; int ret; @@ -1255,31 +1241,12 @@ static int nvme_rdma_post_send(struct nv sge->length = sizeof(struct nvme_command), sge->lkey = queue->device->pd->local_dma_lkey; - qe->cqe.done = nvme_rdma_send_done; - wr.next = NULL; wr.wr_cqe = >cqe; wr.sg_list= sge; wr.num_sge= num_sge; wr.opcode = IB_WR_SEND; - wr.send_flags = 0; - - /* -* Unsignalled send completions are another giant desaster in the -* IB Verbs spec: If we don't regularly post signalled sends -* the send queue will fill up and only a QP reset will rescue us. -* Would have been way to obvious to handle this in hardware or -* at least the RDMA stack.. -* -* Always signal the flushes. The magic request used for the flush -* sequencer is not allocated in our driver's tagset and it's -* triggered to be freed by blk_cleanup_queue(). So we need to -* always mark it as signaled to ensure that the "wr_cqe", which is -* embedded in request's payload, is not freed when __ib_process_cq() -* calls wr_cqe->done(). -*/ - if (nvme_rdma_queue_sig_limit(queue) || flush) - wr.send_flags |= IB_SEND_SIGNALED; + wr.send_flags = IB_SEND_SIGNALED; if (first) first->next = @@ -1329,6 +1296,12 @@ static struct blk_mq_tags *nvme_rdma_tag return queue->ctrl->tag_set.tags[queue_idx - 1]; } +static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "ASYNC"); +} + static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); @@ -1350,10 +1323,12 @@ static void nvme_rdma_submit_async_event cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); + sqe->cqe.done = nvme_rdma_async_done; + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); - ret = nvme_rdma_post_send(queue, sqe, , 1, NULL, false); + ret = nvme_rdma_post_send(queue, sqe, , 1, NULL); WARN_ON_ONCE(ret); } @@ -1639,7 +1614,6 @@ static blk_status_t nvme_rdma_queue_rq(s struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = >sqe; struct nvme_command *c = sqe->data; - bool flush = false; struct ib_device *dev;
[PATCH 4.14 110/110] nvme-rdma: dont suppress send completions
4.14-stable review patch. If anyone has any objections, please let me know. -- From: Sagi Grimberg commit b4b591c87f2b0f4ebaf3a68d4f13873b241aa584 upstream. The entire completions suppress mechanism is currently broken because the HCA might retry a send operation (due to dropped ack) after the nvme transaction has completed. In order to handle this, we signal all send completions and introduce a separate done handler for async events as they will be handled differently (as they don't include in-capsule data by definition). Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/rdma.c | 54 --- 1 file changed, 14 insertions(+), 40 deletions(-) --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -88,7 +88,6 @@ enum nvme_rdma_queue_flags { struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; - atomic_tsig_count; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; @@ -521,7 +520,6 @@ static int nvme_rdma_alloc_queue(struct queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; - atomic_set(>sig_count, 0); queue->cm_id = rdma_create_id(_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); @@ -1232,21 +1230,9 @@ static void nvme_rdma_send_done(struct i nvme_end_request(rq, req->status, req->result); } -/* - * We want to signal completion at least every queue depth/2. This returns the - * largest power of two that is not above half of (queue size + 1) to optimize - * (avoid divisions). - */ -static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) -{ - int limit = 1 << ilog2((queue->queue_size + 1) / 2); - - return (atomic_inc_return(>sig_count) & (limit - 1)) == 0; -} - static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, - struct ib_send_wr *first, bool flush) + struct ib_send_wr *first) { struct ib_send_wr wr, *bad_wr; int ret; @@ -1255,31 +1241,12 @@ static int nvme_rdma_post_send(struct nv sge->length = sizeof(struct nvme_command), sge->lkey = queue->device->pd->local_dma_lkey; - qe->cqe.done = nvme_rdma_send_done; - wr.next = NULL; wr.wr_cqe = >cqe; wr.sg_list= sge; wr.num_sge= num_sge; wr.opcode = IB_WR_SEND; - wr.send_flags = 0; - - /* -* Unsignalled send completions are another giant desaster in the -* IB Verbs spec: If we don't regularly post signalled sends -* the send queue will fill up and only a QP reset will rescue us. -* Would have been way to obvious to handle this in hardware or -* at least the RDMA stack.. -* -* Always signal the flushes. The magic request used for the flush -* sequencer is not allocated in our driver's tagset and it's -* triggered to be freed by blk_cleanup_queue(). So we need to -* always mark it as signaled to ensure that the "wr_cqe", which is -* embedded in request's payload, is not freed when __ib_process_cq() -* calls wr_cqe->done(). -*/ - if (nvme_rdma_queue_sig_limit(queue) || flush) - wr.send_flags |= IB_SEND_SIGNALED; + wr.send_flags = IB_SEND_SIGNALED; if (first) first->next = @@ -1329,6 +1296,12 @@ static struct blk_mq_tags *nvme_rdma_tag return queue->ctrl->tag_set.tags[queue_idx - 1]; } +static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "ASYNC"); +} + static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); @@ -1350,10 +1323,12 @@ static void nvme_rdma_submit_async_event cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); + sqe->cqe.done = nvme_rdma_async_done; + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); - ret = nvme_rdma_post_send(queue, sqe, , 1, NULL, false); + ret = nvme_rdma_post_send(queue, sqe, , 1, NULL); WARN_ON_ONCE(ret); } @@ -1639,7 +1614,6 @@ static blk_status_t nvme_rdma_queue_rq(s struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = >sqe; struct nvme_command *c = sqe->data; - bool flush = false; struct ib_device *dev; blk_status_t ret; int err; @@ -1668,13 +1642,13 @@ static blk_status_t