libmlx4: Implement XRC (eXtended RC) support.

Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]>

diff --git a/src/cq.c b/src/cq.c
index 06fd2ca..c0d7a8b 100644
--- a/src/cq.c
+++ b/src/cq.c
@@ -196,9 +196,11 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
        struct mlx4_cqe *cqe;
        struct mlx4_srq *srq;
        uint32_t qpn;
+       uint32_t srqn;
        uint16_t wqe_index;
        int is_error;
        int is_send;
+       int is_src_recv = 0;
 
        cqe = next_cqe_sw(cq);
        if (!cqe)
@@ -220,20 +222,30 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
        is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
                MLX4_CQE_OPCODE_ERROR;
 
-       if (!*cur_qp ||
-           (ntohl(cqe->my_qpn) & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
-               /*
-                * We do not have to take the QP table lock here,
-                * because CQs will be locked while QPs are removed
-                * from the table.
-                */
-               *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context),
-                                      ntohl(cqe->my_qpn) & 0xffffff);
-               if (!*cur_qp)
+       if (qpn & MLX4_XRC_QPN_BIT && !is_send) {
+               srqn = ntohl(cqe->g_mlpath_rqpn) & 0xffffff;
+                /*
+                 * We do not have to take the XRC SRQ table lock here,
+                 * because CQs will be locked while XRC SRQs are removed
+                 * from the table.
+                 */
+               srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn);
+               if (!srq)
                        return CQ_POLL_ERR;
-       }
-
-       wc->qp_num = (*cur_qp)->ibv_qp.qp_num;
+               is_src_recv = 1;
+        } else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
+                /*
+                 * We do not have to take the QP table lock here,
+                 * because CQs will be locked while QPs are removed
+                 * from the table.
+                 */
+                *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context),
+                                       qpn & 0xffffff);
+                if (!*cur_qp)
+                        return CQ_POLL_ERR;
+        }
+
+       wc->qp_num = qpn & 0xffffff;
 
        if (is_send) {
                wq = &(*cur_qp)->sq;
@@ -241,6 +253,10 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
                wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
                wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
                ++wq->tail;
+       } else if (is_src_recv) {
+               wqe_index = htons(cqe->wqe_index);
+               wc->wr_id = srq->wrid[wqe_index];
+               mlx4_free_srq_wqe(srq, wqe_index);
        } else if ((*cur_qp)->ibv_qp.srq) {
                srq = to_msrq((*cur_qp)->ibv_qp.srq);
                wqe_index = htons(cqe->wqe_index);
@@ -386,6 +402,10 @@ void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, 
struct mlx4_srq *srq)
        uint32_t prod_index;
        uint8_t owner_bit;
        int nfreed = 0;
+       int is_xrc_srq = 0;
+
+       if (srq && srq->ibv_srq.xrc_cq)
+               is_xrc_srq = 1;
 
        pthread_spin_lock(&cq->lock);
 
@@ -406,7 +426,12 @@ void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, 
struct mlx4_srq *srq)
         */
        while ((int) --prod_index - (int) cq->cons_index >= 0) {
                cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
-               if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
+               /*
+                * XRC receive completions are matched by SRQ number, which
+                * sits in the low 24 bits of g_mlpath_rqpn.  Convert to host
+                * byte order first and mask afterwards, as mlx4_poll_one()
+                * does; masking the raw big-endian word drops the wrong byte.
+                */
+               if (is_xrc_srq &&
+                   (ntohl(cqe->g_mlpath_rqpn) & 0xffffff) == srq->srqn &&
+                   !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
+                       mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
+                       ++nfreed;
+               } else if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
                        if (srq && !(cqe->owner_sr_opcode & 
MLX4_CQE_IS_SEND_MASK))
                                mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
                        ++nfreed;
diff --git a/src/mlx4-abi.h b/src/mlx4-abi.h
index 20a40c9..5c04145 100644
--- a/src/mlx4-abi.h
+++ b/src/mlx4-abi.h
@@ -68,6 +68,14 @@ struct mlx4_resize_cq {
        __u64                           buf_addr;
 };
 
+#ifdef HAVE_IBV_CREATE_XRC_SRQ
+/*
+ * Command passed to ibv_cmd_create_xrc_srq() by mlx4_create_xrc_srq():
+ * the generic verbs command followed by the mlx4-specific addresses.
+ */
+struct mlx4_create_xrc_srq {
+       struct ibv_create_xrc_srq       ibv_cmd;
+       __u64                           buf_addr;       /* address of the SRQ buffer */
+       __u64                           db_addr;        /* address of the SRQ doorbell record */
+};
+#endif
+
 struct mlx4_create_srq {
        struct ibv_create_srq           ibv_cmd;
        __u64                           buf_addr;
@@ -90,4 +98,12 @@ struct mlx4_create_qp {
        __u8                            reserved[5];
 };
 
+#ifdef HAVE_IBV_CREATE_XRC_SRQ
+/*
+ * Response filled in by ibv_cmd_open_xrc_domain() on behalf of
+ * mlx4_open_xrc_domain(): the generic verbs response plus the XRC
+ * domain number.
+ */
+struct mlx4_open_xrc_domain_resp {
+       struct ibv_open_xrc_domain_resp ibv_resp;
+       __u32                           xrcdn;          /* copied into mlx4_xrc_domain.xrcdn */
+       __u32                           reserved;       /* presumably padding to 64 bits -- confirm */
+};
+#endif
+
 #endif /* MLX4_ABI_H */
diff --git a/src/mlx4.c b/src/mlx4.c
index b2e2ba9..95902cd 100644
--- a/src/mlx4.c
+++ b/src/mlx4.c
@@ -94,6 +94,11 @@ static struct ibv_context_ops mlx4_ctx_ops = {
        .post_recv     = mlx4_post_recv,
        .create_ah     = mlx4_create_ah,
        .destroy_ah    = mlx4_destroy_ah,
+#ifdef HAVE_IBV_CREATE_XRC_SRQ
+       .create_xrc_srq   = mlx4_create_xrc_srq,
+       .open_xrc_domain  = mlx4_open_xrc_domain,
+       .close_xrc_domain = mlx4_close_xrc_domain,
+#endif
        .attach_mcast  = mlx4_attach_mcast,
        .detach_mcast  = mlx4_detach_mcast
 };
@@ -123,6 +128,15 @@ static struct ibv_context *mlx4_alloc_context(struct 
ibv_device *ibdev, int cmd_
        for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
                context->qp_table[i].refcnt = 0;
 
+       /*
+        * Set up the two-level XRC SRQ lookup table.  The SRQ number is
+        * first masked with (num_xrc_srqs - 1); the bits above
+        * xrc_srq_table_shift select the first-level slot and the bits
+        * under xrc_srq_table_mask index within that slot.
+        *
+        * NOTE(review): the table is sized from resp.qp_tab_size; confirm
+        * that no SRQ-specific size is reported by the kernel.
+        */
+       context->num_xrc_srqs   = resp.qp_tab_size;
+       context->xrc_srq_table_shift = ffs(context->num_xrc_srqs) - 1
+                                      - MLX4_XRC_SRQ_TABLE_BITS;
+       /* The XRC table has its own mask; qp_table_mask belongs to the QP table. */
+       context->xrc_srq_table_mask = (1 << context->xrc_srq_table_shift) - 1;
+
+       pthread_mutex_init(&context->xrc_srq_table_mutex, NULL);
+       for (i = 0; i < MLX4_XRC_SRQ_TABLE_SIZE; ++i)
+               context->xrc_srq_table[i].refcnt = 0;
+
        for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
                context->db_list[i] = NULL;
 
diff --git a/src/mlx4.h b/src/mlx4.h
index 3710a17..deb0f55 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -97,6 +97,16 @@ enum {
        MLX4_QP_TABLE_MASK              = MLX4_QP_TABLE_SIZE - 1
 };
 
+/*
+ * Geometry of the first level of the per-context XRC SRQ lookup table
+ * (the xrc_srq_table[] array in struct mlx4_context).
+ */
+enum {
+       MLX4_XRC_SRQ_TABLE_BITS         = 8,    /* log2 of the number of first-level slots */
+       MLX4_XRC_SRQ_TABLE_SIZE         = 1 << MLX4_XRC_SRQ_TABLE_BITS,
+       MLX4_XRC_SRQ_TABLE_MASK         = MLX4_XRC_SRQ_TABLE_SIZE - 1
+};
+
+/*
+ * Bit tested in the QP number of a CQE by mlx4_poll_one(): when it is set
+ * on a non-send completion, the completion is resolved through the XRC
+ * SRQ table instead of the QP table.
+ */
+enum {
+       MLX4_XRC_QPN_BIT                = (1 << 23)
+};
+
 enum mlx4_db_type {
        MLX4_DB_TYPE_CQ,
        MLX4_DB_TYPE_RQ,
@@ -157,6 +167,15 @@ struct mlx4_context {
        int                             qp_table_shift;
        int                             qp_table_mask;
 
+       struct {
+               struct mlx4_srq       **table;
+               int                     refcnt;
+       }                               xrc_srq_table[MLX4_XRC_SRQ_TABLE_SIZE];
+       pthread_mutex_t                 xrc_srq_table_mutex;
+       int                             num_xrc_srqs;
+       int                             xrc_srq_table_shift;
+       int                             xrc_srq_table_mask;
+
        struct mlx4_db_page            *db_list[MLX4_NUM_DB_TYPE];
        pthread_mutex_t                 db_list_mutex;
 };
@@ -242,6 +261,11 @@ struct mlx4_ah {
        struct mlx4_av                  av;
 };
 
+/*
+ * mlx4 wrapper around a verbs XRC domain.  mlx4_open_xrc_domain()
+ * allocates one of these and hands &ibv_xrcd back to the caller.
+ */
+struct mlx4_xrc_domain {
+       struct ibv_xrc_domain           ibv_xrcd;
+       uint32_t                        xrcdn;  /* domain number from the open response */
+};
+
 static inline unsigned long align(unsigned long val, unsigned long align)
 {
        return (val + align - 1) & ~(align - 1);
@@ -286,6 +310,13 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
        return to_mxxx(ah, ah);
 }
 
+#ifdef HAVE_IBV_CREATE_XRC_SRQ
+/*
+ * Map a verbs XRC domain pointer to the mlx4_xrc_domain that embeds it.
+ * Relies on the to_mxxx() helper macro defined elsewhere in this header
+ * (presumably a container-of style conversion, like to_mah() -- confirm).
+ */
+static inline struct mlx4_xrc_domain *to_mxrcd(struct ibv_xrc_domain *ibxrcd)
+{
+       return to_mxxx(xrcd, xrc_domain);
+}
+#endif
+
 int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
 void mlx4_free_buf(struct mlx4_buf *buf);
 
@@ -317,7 +348,7 @@ void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn,
 void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe);
 
 struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
-                                struct ibv_srq_init_attr *attr);
+                               struct ibv_srq_init_attr *attr);
 int mlx4_modify_srq(struct ibv_srq *srq,
                     struct ibv_srq_attr *attr,
                     enum ibv_srq_attr_mask mask);
@@ -330,6 +361,10 @@ void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
 int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
                       struct ibv_recv_wr *wr,
                       struct ibv_recv_wr **bad_wr);
+struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t 
xrc_srqn);
+int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
+                      struct mlx4_srq *srq);
+void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
 
 struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr 
*attr);
 int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
@@ -360,5 +395,16 @@ int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr 
*attr,
 void mlx4_free_av(struct mlx4_ah *ah);
 int mlx4_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 int mlx4_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
+#ifdef HAVE_IBV_CREATE_XRC_SRQ
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
+                                   struct ibv_xrc_domain *xrc_domain,
+                                   struct ibv_cq *xrc_cq,
+                                   struct ibv_srq_init_attr *attr);
+struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
+                                           int fd, int oflag);
+
+int mlx4_close_xrc_domain(struct ibv_xrc_domain *d);
+#endif
+
 
 #endif /* MLX4_H */
diff --git a/src/qp.c b/src/qp.c
index ae0ae82..defe346 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -157,7 +157,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr 
*wr,
                ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
                qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
-               ctrl->srcrb_flags =
+               ctrl->xrcrb_flags =
                        (wr->send_flags & IBV_SEND_SIGNALED ?
                         htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
                        (wr->send_flags & IBV_SEND_SOLICITED ?
@@ -174,6 +174,9 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr 
*wr,
                size = sizeof *ctrl / 16;
 
                switch (ibqp->qp_type) {
+               case IBV_QPT_XRC:
+                        ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 
8);
+                        /* fall thru */
                case IBV_QPT_RC:
                case IBV_QPT_UC:
                        switch (wr->opcode) {
diff --git a/src/srq.c b/src/srq.c
index ba2ceb9..3cd1a95 100644
--- a/src/srq.c
+++ b/src/srq.c
@@ -167,3 +167,53 @@ int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct 
ibv_srq_attr *attr,
 
        return 0;
 }
+
+/*
+ * Look up the SRQ registered under xrc_srqn in the context's two-level
+ * XRC SRQ table.  Returns NULL when the first-level slot that xrc_srqn
+ * maps to holds no entries.  No lock is taken by this function.
+ */
+struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+{
+       int slot = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >>
+               ctx->xrc_srq_table_shift;
+
+       if (!ctx->xrc_srq_table[slot].refcnt)
+               return NULL;
+
+       return ctx->xrc_srq_table[slot].table[xrc_srqn &
+                                             ctx->xrc_srq_table_mask];
+}
+
+/*
+ * Register srq under xrc_srqn in the context's XRC SRQ table, holding
+ * xrc_srq_table_mutex for the duration.  The second-level array of a
+ * first-level slot is allocated (zeroed) when the slot gets its first
+ * entry.  Returns 0 on success, or -1 if that allocation fails, in which
+ * case the table is left unchanged.
+ */
+int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
+                      struct mlx4_srq *srq)
+{
+       int slot = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >>
+               ctx->xrc_srq_table_shift;
+       int rc = -1;
+
+       pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+
+       if (ctx->xrc_srq_table[slot].refcnt == 0) {
+               ctx->xrc_srq_table[slot].table =
+                       calloc(ctx->xrc_srq_table_mask + 1,
+                              sizeof (struct mlx4_srq *));
+               if (ctx->xrc_srq_table[slot].table == NULL)
+                       goto unlock;
+       }
+
+       ctx->xrc_srq_table[slot].refcnt++;
+       ctx->xrc_srq_table[slot].table[xrc_srqn &
+                                      ctx->xrc_srq_table_mask] = srq;
+       rc = 0;
+
+unlock:
+       pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+       return rc;
+}
+
+/*
+ * Remove the entry for xrc_srqn from the context's XRC SRQ table, holding
+ * xrc_srq_table_mutex for the duration.  Dropping the last entry of a
+ * first-level slot frees that slot's second-level array; otherwise only
+ * the entry itself is reset to NULL.
+ */
+void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+{
+       int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >>
+               ctx->xrc_srq_table_shift;
+
+       pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+
+       if (!--ctx->xrc_srq_table[tind].refcnt) {
+               free(ctx->xrc_srq_table[tind].table);
+       } else {
+               /*
+                * Clear the slot in the XRC SRQ table itself.  Writing to
+                * qp_table here would wipe an unrelated QP entry and leave
+                * a stale SRQ pointer for mlx4_find_xrc_srq() to return.
+                */
+               ctx->xrc_srq_table[tind].table[xrc_srqn &
+                                              ctx->xrc_srq_table_mask] = NULL;
+       }
+
+       pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+}
+
diff --git a/src/verbs.c b/src/verbs.c
index b0273a1..728da64 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -246,7 +246,7 @@ int mlx4_destroy_cq(struct ibv_cq *cq)
 }
 
 struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
-                                struct ibv_srq_init_attr *attr)
+                               struct ibv_srq_init_attr *attr)
 {
        struct mlx4_create_srq      cmd;
        struct mlx4_create_srq_resp resp;
@@ -287,7 +287,6 @@ struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
                goto err_db;
 
        srq->srqn = resp.srqn;
-
        return &srq->ibv_srq;
 
 err_db:
@@ -320,18 +319,36 @@ int mlx4_query_srq(struct ibv_srq *srq,
        return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
 }
 
-int mlx4_destroy_srq(struct ibv_srq *srq)
+int mlx4_destroy_srq(struct ibv_srq *ibsrq)
 {
+       struct mlx4_srq *srq = to_msrq(ibsrq);
+       struct mlx4_cq *mcq = NULL;
        int ret;
 
-       ret = ibv_cmd_destroy_srq(srq);
-       if (ret)
+       if (ibsrq->xrc_cq) {
+               /* is an xrc_srq */
+               mcq = to_mcq(ibsrq->xrc_cq);
+               mlx4_cq_clean(mcq, 0, srq);
+               pthread_spin_lock(&mcq->lock);
+               mlx4_clear_xrc_srq(to_mctx(ibsrq->context), srq->srqn);
+               pthread_spin_unlock(&mcq->lock);
+       }
+
+       ret = ibv_cmd_destroy_srq(ibsrq);
+       if (ret) {
+               if (ibsrq->xrc_cq) {
+                       pthread_spin_lock(&mcq->lock);
+                       mlx4_store_xrc_srq(to_mctx(ibsrq->context),
+                                          srq->srqn, srq);
+                       pthread_spin_unlock(&mcq->lock);
+               }
                return ret;
+       }
 
-       mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
-       mlx4_free_buf(&to_msrq(srq)->buf);
-       free(to_msrq(srq)->wrid);
-       free(to_msrq(srq));
+       mlx4_free_db(to_mctx(ibsrq->context), MLX4_DB_TYPE_RQ, srq->db);
+       mlx4_free_buf(&srq->buf);
+       free(srq->wrid);
+       free(srq);
 
        return 0;
 }
@@ -606,3 +623,103 @@ int mlx4_detach_mcast(struct ibv_qp *qp, union ibv_gid 
*gid, uint16_t lid)
 {
        return ibv_cmd_detach_mcast(qp, gid, lid);
 }
+
+#ifdef HAVE_IBV_CREATE_XRC_SRQ
+/*
+ * Create an XRC shared receive queue.
+ *
+ * pd         - protection domain the SRQ is created on
+ * xrc_domain - XRC domain; its handle is passed to the create command
+ * xrc_cq     - completion queue; its handle is passed to the create command
+ * attr       - requested SRQ attributes (max_wr and max_sge are validated)
+ *
+ * Returns the embedded ibv_srq on success, or NULL on any failure, after
+ * releasing whatever had been acquired up to that point.
+ */
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
+                                   struct ibv_xrc_domain *xrc_domain,
+                                   struct ibv_cq *xrc_cq,
+                                   struct ibv_srq_init_attr *attr)
+{
+       struct mlx4_create_xrc_srq  cmd;
+       struct mlx4_create_srq_resp resp;
+       struct mlx4_srq            *srq;
+       int                         ret;
+
+       /* Sanity check SRQ size before proceeding */
+       if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
+               return NULL;
+
+       srq = malloc(sizeof *srq);
+       if (!srq)
+               return NULL;
+
+       if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+               goto err;
+
+       /* One more than requested, then rounded by align_queue_size(). */
+       srq->max     = align_queue_size(attr->attr.max_wr + 1);
+       srq->max_gs  = attr->attr.max_sge;
+       srq->counter = 0;
+
+       if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
+               goto err;
+
+       srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
+       if (!srq->db)
+               goto err_free;
+
+       /* Start with a zeroed doorbell record. */
+       *srq->db = 0;
+
+       /* Tell the kernel where the queue buffer and doorbell live. */
+       cmd.buf_addr = (uintptr_t) srq->buf.buf;
+       cmd.db_addr  = (uintptr_t) srq->db;
+
+       ret = ibv_cmd_create_xrc_srq(pd, &srq->ibv_srq, attr,
+                                    xrc_domain->handle,
+                                    xrc_cq->handle,
+                                    &cmd.ibv_cmd, sizeof cmd,
+                                    &resp.ibv_resp, sizeof resp);
+       if (ret)
+               goto err_db;
+
+       srq->ibv_srq.xrc_srq_num = srq->srqn = resp.srqn;
+
+       /* Register in the XRC SRQ table so mlx4_find_xrc_srq() can resolve it. */
+       ret = mlx4_store_xrc_srq(to_mctx(pd->context), 
srq->ibv_srq.xrc_srq_num, srq);
+       if (ret)
+               goto err_destroy;
+
+       return &srq->ibv_srq;
+
+       /* Unwind in reverse order of acquisition; each label falls through. */
+err_destroy:
+       ibv_cmd_destroy_srq(&srq->ibv_srq);
+
+err_db:
+       mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
+
+err_free:
+       /* srq->wrid is presumably allocated by mlx4_alloc_srq_buf() -- confirm. */
+       free(srq->wrid);
+       mlx4_free_buf(&srq->buf);
+
+err:
+       free(srq);
+
+       return NULL;
+}
+
+/*
+ * Open an XRC domain on the given context.  fd and oflag are forwarded
+ * unchanged to ibv_cmd_open_xrc_domain().  Returns the embedded
+ * ibv_xrc_domain on success; returns NULL if the allocation or the
+ * command fails.
+ */
+struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
+                                           int fd, int oflag)
+{
+       struct mlx4_open_xrc_domain_resp resp;
+       struct mlx4_xrc_domain *domain = malloc(sizeof *domain);
+
+       if (domain == NULL)
+               return NULL;
+
+       if (ibv_cmd_open_xrc_domain(context, fd, oflag, &domain->ibv_xrcd,
+                                   &resp.ibv_resp, sizeof resp) != 0) {
+               free(domain);
+               return NULL;
+       }
+
+       /* Keep the domain number reported in the response. */
+       domain->xrcdn = resp.xrcdn;
+       return &domain->ibv_xrcd;
+}
+
+/*
+ * Close an XRC domain obtained from mlx4_open_xrc_domain().
+ *
+ * Returns the result of ibv_cmd_close_xrc_domain().  The userspace object
+ * is released only when that command succeeds, so after a failed close
+ * the domain is still valid and the caller sees the error, in the same
+ * way mlx4_destroy_srq() reports a failed destroy.
+ */
+int mlx4_close_xrc_domain(struct ibv_xrc_domain *d)
+{
+       int ret;
+
+       ret = ibv_cmd_close_xrc_domain(d);
+       if (ret)
+               return ret;
+
+       /* Free the containing object that mlx4_open_xrc_domain() allocated. */
+       free(to_mxrcd(d));
+       return 0;
+}
+#endif
diff --git a/src/wqe.h b/src/wqe.h
index 6f7f309..fa2f8ac 100644
--- a/src/wqe.h
+++ b/src/wqe.h
@@ -65,7 +65,7 @@ struct mlx4_wqe_ctrl_seg {
         * [1]   SE (solicited event)
         * [0]   FL (force loopback)
         */
-       uint32_t                srcrb_flags;
+       uint32_t                xrcrb_flags;
        /*
         * imm is immediate data for send/RDMA write w/ immediate;
         * also invalidation key for send with invalidate; input
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to