- increase timers during subsequent retries (timeout doubles on each retry),
- check/process create_ah errors during the connect phase,
- clean up some debug messaging.

Signed-off-by: Arlin Davis <[email protected]>
---
 dapl/openib_ucm/cm.c |   81 ++++++++++++++++++++++++++-----------------------
 1 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 96ee382..07b8458 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -163,17 +163,16 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
                *timer = cm->hca->ib_trans.cm_timer; 
                /* wait longer each retry */
                if ((time - cm->timer)/1000 > 
-                   (cm->hca->ib_trans.rep_time * cm->retries)) {
+                   (cm->hca->ib_trans.rep_time << cm->retries)) {
                        dapl_log(DAPL_DBG_TYPE_WARN,
                                 " CM_REQ retry %d [lid, port, qpn]:"
-                                " %x %x %x -> %x %x %x \n", 
-                                cm->retries,
-                                ntohs(cm->msg.saddr.ib.lid), 
-                                ntohs(cm->msg.sport),
-                                ntohl(cm->msg.saddr.ib.qpn), 
-                                ntohs(cm->msg.daddr.ib.lid), 
-                                ntohs(cm->msg.dport),
-                                ntohl(cm->msg.dqpn));
+                                " %x %x %x -> %x %x %x Time(ms) %llu > 
%llu\n", 
+                                cm->retries, ntohs(cm->msg.saddr.ib.lid), 
+                                ntohs(cm->msg.sport), 
ntohl(cm->msg.saddr.ib.qpn), 
+                                ntohs(cm->msg.daddr.ib.lid), 
ntohs(cm->msg.dport),
+                                ntohl(cm->msg.dqpn), (time - cm->timer)/1000, 
+                                cm->hca->ib_trans.rep_time << cm->retries);
+                       cm->retries++;
                        dapl_os_unlock(&cm->lock);
                        dapli_cm_connect(cm->ep, cm);
                        return;
@@ -182,10 +181,10 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
        case DCM_RTU_PENDING: 
                *timer = cm->hca->ib_trans.cm_timer;  
                if ((time - cm->timer)/1000 > 
-                   (cm->hca->ib_trans.rtu_time * cm->retries)) {
+                   (cm->hca->ib_trans.rtu_time << cm->retries)) {
                        dapl_log(DAPL_DBG_TYPE_WARN,
                                 " CM_REPLY retry %d [lid, port, qpn]:"
-                                " %x %x %x -> %x %x %x r_pid %x,%d\n", 
+                                " %x %x %x -> %x %x %x r_pid %x,%d Time(ms) 
%llu > %llu\n", 
                                 cm->retries,
                                 ntohs(cm->msg.saddr.ib.lid), 
                                 ntohs(cm->msg.sport),
@@ -194,7 +193,9 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
                                 ntohs(cm->msg.dport),
                                 ntohl(cm->msg.daddr.ib.qpn),  
                                 ntohl(*(DAT_UINT32*)cm->msg.resv),
-                                ntohl(*(DAT_UINT32*)cm->msg.resv)); 
+                                ntohl(*(DAT_UINT32*)cm->msg.resv), 
+                                (time - cm->timer)/1000, 
cm->hca->ib_trans.rtu_time << cm->retries);
+                       cm->retries++;
                        dapl_os_unlock(&cm->lock);
                        ucm_reply(cm);
                        return;
@@ -204,10 +205,10 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
                *timer = cm->hca->ib_trans.cm_timer; 
                /* wait longer each retry */
                if ((time - cm->timer)/1000 > 
-                   (cm->hca->ib_trans.rep_time)) {
+                   (cm->hca->ib_trans.rtu_time << cm->retries)) {
                        dapl_log(DAPL_DBG_TYPE_WARN,
                                 " CM_DREQ retry %d [lid, port, qpn]:"
-                                " %x %x %x -> %x %x %x r_pid %x,%d\n", 
+                                " %x %x %x -> %x %x %x r_pid %x,%d Time(ms) 
%llu > %llu\n", 
                                 cm->retries,
                                 ntohs(cm->msg.saddr.ib.lid), 
                                 ntohs(cm->msg.sport),
@@ -216,7 +217,9 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
                                 ntohs(cm->msg.dport),
                                 ntohl(cm->msg.dqpn), 
                                 ntohl(*(DAT_UINT32*)cm->msg.resv),
-                                ntohl(*(DAT_UINT32*)cm->msg.resv)); 
+                                ntohl(*(DAT_UINT32*)cm->msg.resv), 
+                                (time - cm->timer)/1000, 
cm->hca->ib_trans.rtu_time << cm->retries);
+                       cm->retries++;
                        dapl_os_unlock(&cm->lock);
                        dapli_cm_disconnect(cm);
                         return;
@@ -448,8 +451,8 @@ retry_listenq:
                        } else {
                                /* duplicate; bail and throw away */
                                dapl_os_unlock(lock);
-                               dapl_log(DAPL_DBG_TYPE_CM,
-                                        " duplicate: op %s st %s [lid, port, 
qpn]:"
+                               dapl_log(DAPL_DBG_TYPE_WARN,
+                                        " DUPLICATE: op %s st %s [lid, port, 
qpn]:"
                                         " 0x%x %d 0x%x <- 0x%x %d 0x%x\n", 
                                         dapl_cm_op_str(ntohs(msg->op)), 
                                         dapl_cm_state_str(cm->state),
@@ -476,7 +479,18 @@ retry_listenq:
        /* not match on listenq for valid request, send reject */
        if (ntohs(msg->op) == DCM_REQ && !found)
                ucm_reject(tp, msg);
-
+#if DAPL_DBG
+       if (!found) {
+               dapl_log(DAPL_DBG_TYPE_WARN,
+                       " ucm_recv: NO MATCH op %s 0x%x %d i0x%x c0x%x"
+                       " < 0x%x %d 0x%x\n", 
+                       dapl_cm_op_str(ntohs(msg->op)), 
+                       ntohs(msg->daddr.ib.lid), ntohs(msg->dport), 
+                       ntohl(msg->daddr.ib.qpn), ntohl(msg->sqpn),
+                       ntohs(msg->saddr.ib.lid), ntohs(msg->sport), 
+                       ntohl(msg->saddr.ib.qpn));
+       }
+#endif
        return found;
 }
 
@@ -524,21 +538,10 @@ retry:
                        continue;
                }
                if (!(cm = ucm_cm_find(tp, msg))) {
-                       dapl_log(DAPL_DBG_TYPE_WARN,
-                                " ucm_recv: NO MATCH op %s 0x%x %d i0x%x c0x%x"
-                                " < 0x%x %d 0x%x\n", 
-                                dapl_cm_op_str(ntohs(msg->op)), 
-                                ntohs(msg->daddr.ib.lid), ntohs(msg->dport), 
-                                ntohl(msg->daddr.ib.qpn),
-                                ntohl(msg->sqpn),
-                                ntohs(msg->saddr.ib.lid), ntohs(msg->sport), 
-                                ntohl(msg->saddr.ib.qpn));
-
                        ucm_post_rmsg(tp, msg);
                        continue;
                }
-               dapl_dbg_log(DAPL_DBG_TYPE_CM, " ucm_recv: match %p\n",cm);
-
+               
                /* match, process it */
                ucm_process_recv(tp, msg, cm);
                ucm_post_rmsg(tp, msg);
@@ -804,14 +807,13 @@ DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm)
                /* send DREQ, event after DREP or DREQ timeout */
                cm->state = DCM_DISC_PENDING;
                cm->msg.op = htons(DCM_DREQ);
-               cm->retries = 1;
                finalize = 0; /* wait for DREP, wakeup timer thread */
                dapls_thread_signal(&cm->hca->ib_trans.signal);
                break;
        case DCM_DISC_PENDING:
                /* DREQ timeout, resend until retries exhausted */
                cm->msg.op = htons(DCM_DREQ);
-               if (cm->retries++ >= cm->hca->ib_trans.retries)
+               if (cm->retries >= cm->hca->ib_trans.retries)
                        finalize = 1;
                break;
        case DCM_DISC_RECV:
@@ -854,7 +856,7 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
                return DAT_INVALID_STATE;
        }
        
-       if (cm->retries++ == cm->hca->ib_trans.retries) {
+       if (cm->retries == cm->hca->ib_trans.retries) {
                dapl_log(DAPL_DBG_TYPE_WARN, 
                        " CM_REQ: RETRIES EXHAUSTED:"
                         " 0x%x %d 0x%x -> 0x%x %d 0x%x\n",
@@ -895,7 +897,7 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
                goto bail;
 
        /* first time through, put on work queue */
-       if (cm->retries == 1)
+       if (!cm->retries)
                ucm_queue_conn(cm);
 
        return DAT_SUCCESS;
@@ -1126,7 +1128,6 @@ static void ucm_accept(ib_cm_srvc_handle_t cm, 
ib_cm_msg_t *msg)
        /* dest CM info from CR msg, source CM info from listen */
        acm->sp = cm->sp;
        acm->hca = cm->hca;
-       acm->retries = 1;
        acm->msg.dport = msg->sport;
        acm->msg.dqpn = msg->sqpn;
        acm->msg.sport = cm->msg.sport; 
@@ -1220,9 +1221,13 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, 
ib_cm_msg_t *msg)
                                                      cm->ep->qp_handle, 
                                                      htons(lid), 
                                                      NULL);
-               if (xevent.remote_ah.ah == NULL) 
+               if (xevent.remote_ah.ah == NULL) {
+                       dapl_log(DAPL_DBG_TYPE_ERR,
+                                " accept_rtu: ERR create_ah"
+                                " for qpn 0x%x lid 0x%x\n",
+                                xevent.remote_ah.qpn, lid);
                        goto bail;
-
+               }
                cm->ah = xevent.remote_ah.ah; /* keep ref to destroy */
                dapl_os_memcpy(&xevent.remote_ah.ia_addr,
                               &cm->msg.daddr,
@@ -1283,7 +1288,7 @@ static int ucm_reply(dp_ib_cm_handle_t cm)
                return -1;
        }
 
-       if (++cm->retries == cm->hca->ib_trans.retries) {
+       if (cm->retries == cm->hca->ib_trans.retries) {
                dapl_log(DAPL_DBG_TYPE_WARN, 
                         " CM_REP: RETRIES EXHAUSTED"
                         " 0x%x %d 0x%x -> 0x%x %d 0x%x\n",
-- 
1.5.2.5

_______________________________________________
ofw mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw

Reply via email to