I am at a loss.  I am hacking some RDMA code to do an RDMA write from a server
to a client.

I have it working perfectly on a small 2 node test system.  When I move the
code to another system, I get a "transport retry counter exceeded"
error.  I just can't figure out why an RDMA Write is timing out like this.

Which ibv_qp_init_attr fields (passed to ibv_create_qp) or ibv_qp_attr fields
(passed to ibv_modify_qp) might I have to change to account for different
hardware?

NOTE: In both of these tests I am trying to transfer data between 2 nodes on
the same switch.  The hardware is different and the payload is small
(<512 bytes).

Here is my init code on the server side:

rdma_create_qp...

        struct ibv_qp *qp;

        struct ibv_qp_init_attr attr = {
                .send_cq = p_sa->rdma_ctx.cq,
                .recv_cq = p_sa->rdma_ctx.cq,
                .cap     = {
                        .max_send_wr  = 10,
                        .max_recv_wr  = 500,
                        .max_send_sge = 1,
                        .max_recv_sge = 1
                },
                .qp_type = IBV_QPT_RC
        };
        
        qp = ibv_create_qp(p_sa->rdma_ctx.pd, &attr);
        if (!qp) {
                return (IB_INSUFFICIENT_RESOURCES);
        }

        {
                struct ibv_qp_attr attr = {
                        .qp_state        = IBV_QPS_INIT,
                        .pkey_index      = 0,
                        .port_num        = rdma_ctx.device_port,
                        .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | 
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE
                };

                if (ibv_modify_qp(qp, &attr,
                                  IBV_QP_STATE              |
                                  IBV_QP_PKEY_INDEX         |
                                  IBV_QP_PORT               |
                                  IBV_QP_ACCESS_FLAGS)) {
                        goto DestroyQP;
                }
        }
...

rdma_modify_qp...

        /* transition it to RTR/RTS with eth_info and path record */
        struct ibv_qp_attr attr = {
                .qp_state               = IBV_QPS_RTR,
                .path_mtu               = path->mtu,
                .dest_qp_num            = eth_info->qpn,
                .rq_psn                 = 1,
                .max_dest_rd_atomic     = 1,
                .min_rnr_timer          = 12,
                .ah_attr                = {
                        .is_global      = 0,
                        .dlid           = path->dlid,
                        .sl             = path->sl,
                        .src_path_bits  = 0,
                        .port_num       = rdma_ctx.device_port
                }
        };

        if (ibv_modify_qp(qp, &attr,
                          IBV_QP_STATE              |
                          IBV_QP_PATH_MTU           |
                          IBV_QP_DEST_QPN           |
                          IBV_QP_RQ_PSN             |
                          IBV_QP_MAX_DEST_RD_ATOMIC |
                          IBV_QP_MIN_RNR_TIMER      |
                          IBV_QP_AV)) {
                return 1;
        }

        attr.qp_state       = IBV_QPS_RTS;
        attr.timeout        = 14;
        attr.retry_cnt      = 7;
        attr.rnr_retry      = 7;
        attr.sq_psn         = 1;
        attr.max_rd_atomic  = 1;
        if (ibv_modify_qp(qp, &attr,
                          IBV_QP_STATE              |
                          IBV_QP_TIMEOUT            |
                          IBV_QP_RETRY_CNT          |
                          IBV_QP_RNR_RETRY          |
                          IBV_QP_SQ_PSN             |
                          IBV_QP_MAX_QP_RD_ATOMIC)) {
                return 1;
        }
...


Here is the code on the client:

rdma_create_qp...

        struct ibv_qp_init_attr attr = {
                .send_cq = rdma_ctx.cq,
                .recv_cq = rdma_ctx.cq,
                .cap     = {
                        .max_send_wr  = 10,
                        .max_recv_wr  = 10,
                        .max_send_sge = 1,
                        .max_recv_sge = 1
                },
                .qp_type = IBV_QPT_RC
        };
        
        qp = ibv_create_qp(rdma_ctx.pd, &attr);
        if (!qp) {
                return (-ENOMEM);
        }

        {
                struct ibv_qp_attr attr = {
                        .qp_state        = IBV_QPS_INIT,
                        .pkey_index      = 0,
                        .port_num        = rdma_ctx.device_port,
                        .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | 
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE
                };

                if (ibv_modify_qp(qp, &attr,
                                  IBV_QP_STATE              |
                                  IBV_QP_PKEY_INDEX         |
                                  IBV_QP_PORT               |
                                  IBV_QP_ACCESS_FLAGS)) {
                        return (-ENOMEM);
                }
        }

...

rdma_connect_qp...

        struct ibv_qp_attr attr = {
                .qp_state               = IBV_QPS_RTR,
                .path_mtu               = conn->path.mtu,
                .dest_qp_num            = conn->rqpn,
                .rq_psn                 = 1,
                .max_dest_rd_atomic     = 1,
                .min_rnr_timer          = 12,
                .ah_attr                = {
                        .is_global      = 0,
                        .dlid           = conn->path.dlid,
                        .sl             = conn->path.sl,
                        .src_path_bits  = 0,
                        .port_num       = rdma_ctx.device_port
                }
        };

        if (ibv_modify_qp(conn->qp, &attr,
                          IBV_QP_STATE              |
                          IBV_QP_PATH_MTU           |
                          IBV_QP_DEST_QPN           |
                          IBV_QP_RQ_PSN             |
                          IBV_QP_MAX_DEST_RD_ATOMIC |
                          IBV_QP_MIN_RNR_TIMER      |
                          IBV_QP_AV)) {
                return 1;
        }

        attr.qp_state        = IBV_QPS_RTS;
        attr.timeout         = 14;
        attr.retry_cnt       = 7;
        attr.rnr_retry       = 7;
        attr.sq_psn          = 1;
        attr.max_rd_atomic   = 1;
        if (ibv_modify_qp(conn->qp, &attr,
                          IBV_QP_STATE              |
                          IBV_QP_TIMEOUT            |
                          IBV_QP_RETRY_CNT          |
                          IBV_QP_RNR_RETRY          |
                          IBV_QP_SQ_PSN             |
                          IBV_QP_MAX_QP_RD_ATOMIC)) {
                return 1;
        }
...


Thanks,
Ira

-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
[email protected]
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to