Full sendq should retry polling completions instead of failing.
When the sendq is full and all requests are still pending, the code that
gets a send message (ucm_get_smsg) should retry polling for completions
rather than return an error on the first empty CQ poll. This gives the
HCA a chance to complete some of the batched requests.
Also, clean up the send message error logging.

Signed-off-by: Arlin Davis <arlin.r.da...@intel.com>
---
 dapl/openib_ucm/cm.c |   26 +++++++++++++++-----------
 1 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 39ef28d..6b5867a 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -234,38 +234,42 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int *timer)
 static ib_cm_msg_t *ucm_get_smsg(ib_hca_transport_t *tp)
 {
        ib_cm_msg_t *msg = NULL; 
-       int ret, polled = 0, hd = tp->s_hd;
+       int ret, polled = 1, hd = tp->s_hd;
 
        hd++;
 
        if (hd == tp->qpe)
                hd = 0;
 retry:
-       if (hd == tp->s_tl)
+       if (hd == tp->s_tl) {
                msg = NULL;
+               if (polled % 1000000 == 0)
+                       dapl_log(DAPL_DBG_TYPE_WARN,
+                                " ucm_get_smsg: FULLq hd %d == tl %d,"
+                                " completions stalled, polls=%d\n",
+                                hd, tp->s_tl, polled);
+       }
        else {
                msg = &tp->sbuf[hd];
                tp->s_hd = hd; /* new hd */
        }
 
        /* if empty, process some completions */
-       if ((msg == NULL) && (!polled)) {
+       if (msg == NULL) {
                struct ibv_wc wc;
 
                /* process completions, based on UCM_TX_BURST */
                ret = ibv_poll_cq(tp->scq, 1, &wc);
                if (ret < 0) {
                        dapl_log(DAPL_DBG_TYPE_WARN,
-                               " get_smsg: cq %p %s\n", 
+                               " get_smsg: cq %p %s\n",
                                tp->scq, strerror(errno));
+                       return NULL;
                }
                /* free up completed sends, update tail */
-               if (ret > 0) {
+               if (ret > 0)
                        tp->s_tl = (int)wc.wr_id;
-                       dapl_log(DAPL_DBG_TYPE_CM,
-                               " get_smsg: wr_cmp (%d) s_tl=%d\n", 
-                               wc.status, tp->s_tl);
-               }
+
                polled++;
                goto retry;
        }
@@ -1000,8 +1004,8 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
 
 bail:
        dapl_log(DAPL_DBG_TYPE_WARN, 
-                " connect: ERR %s -> cm_lid %x cm_qpn %x r_psp %x p_sz=%d\n",
-                strerror(errno), htons(cm->msg.daddr.ib.lid), 
+                " connect: snd ERR -> cm_lid %x cm_qpn %x r_psp %x p_sz=%d\n",
+                htons(cm->msg.daddr.ib.lid),
                 htonl(cm->msg.dqpn), htons(cm->msg.dport), 
                 htons(cm->msg.p_size));
 
-- 
1.7.3



_______________________________________________
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg

Reply via email to