With large-scale all-to-all connections on 1,500+ cores,
the listen backlog is reached and SYNs are dropped,
which causes the connect to time out. Retry the connect
on timeout errors.

Signed-off-by: Arlin Davis <[email protected]>
---
 dapl/openib_scm/cm.c |   24 ++++++++++++++++++++++--
 1 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index 7465190..4c8d4a1 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -60,6 +60,12 @@
 #include "dapl_ep_util.h"
 #include "dapl_osd.h"
 
+/* forward declarations */
+static DAT_RETURN
+dapli_socket_connect(DAPL_EP * ep_ptr,
+                    DAT_IA_ADDRESS_PTR r_addr,
+                    DAT_CONN_QUAL r_qual, DAT_COUNT p_size, DAT_PVOID p_data);
+
 #ifdef DAPL_DBG
 /* Check for EP linking to IA and proper connect state */
 void dapli_ep_check(DAPL_EP *ep)
@@ -494,13 +500,27 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
 
        if (err) {
                dapl_log(DAPL_DBG_TYPE_ERR,
-                        " CONN_PENDING: %s ERR %s -> %s %d\n",
+                        " CONN_PENDING: %s ERR %s -> %s %d - %s\n",
                         err == -1 ? "POLL" : "SOCKOPT",
                         err == -1 ? strerror(dapl_socket_errno()) : 
strerror(err), 
                         inet_ntoa(((struct sockaddr_in *)
                                &cm_ptr->addr)->sin_addr), 
                         ntohs(((struct sockaddr_in *)
-                               &cm_ptr->addr)->sin_port));
+                               &cm_ptr->addr)->sin_port),
+                        err == ETIMEDOUT ? "RETRYING...":"ABORTING");
+
+               /* retry a timeout */
+               if (err == ETIMEDOUT) {
+                       closesocket(cm_ptr->socket);
+                       cm_ptr->socket = DAPL_INVALID_SOCKET;
+                       dapli_socket_connect(cm_ptr->ep, 
(DAT_IA_ADDRESS_PTR)&cm_ptr->addr, 
+                                            ntohs(((struct sockaddr_in 
*)&cm_ptr->addr)->sin_port) - 1000,
+                                            ntohs(cm_ptr->msg.p_size), 
&cm_ptr->msg.p_data);
+                       dapl_ep_unlink_cm(cm_ptr->ep, cm_ptr);
+                       dapli_cm_free(cm_ptr);
+                       return;
+               }
+
                goto bail;
        }
 
-- 
1.5.2.5

_______________________________________________
ofw mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw

Reply via email to