With large-scale workloads, a Linux server starts rejecting
socket connect requests. Add retry logic for connection-refused
(ECONNREFUSED) errors.

Increasing net.ipv4.tcp_max_syn_backlog to 2048 will also reduce the
chance of these errors when scaling up.

Signed-off-by: Arlin Davis <[email protected]>
---
 dapl/openib_scm/cm.c           |    6 ++++--
 dapl/openib_scm/dapl_ib_util.h |    2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index f82d0ff..b95db30 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -390,6 +390,7 @@ static dp_ib_cm_handle_t dapli_cm_alloc(DAPL_EP *ep_ptr)
 
        cm_ptr->msg.ver = htons(DCM_VER);
        cm_ptr->socket = DAPL_INVALID_SOCKET;
+       cm_ptr->retry = SCM_CR_RETRY;
        dapls_cm_acquire(cm_ptr);
                
        /* Link EP and CM */
@@ -507,10 +508,11 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
                                &cm_ptr->addr)->sin_addr), 
                         ntohs(((struct sockaddr_in *)
                                &cm_ptr->addr)->sin_port),
-                        err == ETIMEDOUT ? "RETRYING...":"ABORTING");
+                        (err == ETIMEDOUT || err == ECONNREFUSED) ? 
+                        "RETRYING...":"ABORTING");
 
                /* retry a timeout */
-               if (err == ETIMEDOUT) {
+               if ((err == ETIMEDOUT) || (ECONNREFUSED && --cm_ptr->retry)) {
                        closesocket(cm_ptr->socket);
                        cm_ptr->socket = DAPL_INVALID_SOCKET;
                        dapli_socket_connect(cm_ptr->ep, 
(DAT_IA_ADDRESS_PTR)&cm_ptr->addr, 
diff --git a/dapl/openib_scm/dapl_ib_util.h b/dapl/openib_scm/dapl_ib_util.h
index 4bb1a4a..5f9fb43 100644
--- a/dapl/openib_scm/dapl_ib_util.h
+++ b/dapl/openib_scm/dapl_ib_util.h
@@ -40,6 +40,7 @@ struct ib_cm_handle
        DAPL_OS_LOCK            lock;
        int                     ref_count;
        int                     state;
+       int                     retry;
        DAPL_SOCKET             socket;
        struct dapl_hca         *hca;
        struct dapl_sp          *sp;    
@@ -63,6 +64,7 @@ typedef dp_ib_cm_handle_t     ib_cm_srvc_handle_t;
 #define SCM_ACK_RETRY 7  /* 3 bits, 7 * 268ms = 1.8 seconds */
 #define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
 #define SCM_RNR_RETRY 7  /* 3 bits, 7 == infinite */
+#define SCM_CR_RETRY  5  /* retries for busy server, connect refused */
 #define SCM_IB_MTU    2048
 
 /* Global routing defaults */
-- 
1.7.3



_______________________________________________
ofw mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw

Reply via email to