With large-scale workloads, a Linux server starts rejecting socket connect requests. Add retry logic for connection-refused errors.
increasing net.ipv4.tcp_max_syn_backlog to 2048 will also reduce the chance of these errors when scaling up. Signed-off-by: Arlin Davis <[email protected]> --- dapl/openib_scm/cm.c | 6 ++++-- dapl/openib_scm/dapl_ib_util.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c index f82d0ff..b95db30 100644 --- a/dapl/openib_scm/cm.c +++ b/dapl/openib_scm/cm.c @@ -390,6 +390,7 @@ static dp_ib_cm_handle_t dapli_cm_alloc(DAPL_EP *ep_ptr) cm_ptr->msg.ver = htons(DCM_VER); cm_ptr->socket = DAPL_INVALID_SOCKET; + cm_ptr->retry = SCM_CR_RETRY; dapls_cm_acquire(cm_ptr); /* Link EP and CM */ @@ -507,10 +508,11 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err) &cm_ptr->addr)->sin_addr), ntohs(((struct sockaddr_in *) &cm_ptr->addr)->sin_port), - err == ETIMEDOUT ? "RETRYING...":"ABORTING"); + (err == ETIMEDOUT || err == ECONNREFUSED) ? + "RETRYING...":"ABORTING"); /* retry a timeout */ - if (err == ETIMEDOUT) { + if ((err == ETIMEDOUT) || (ECONNREFUSED && --cm_ptr->retry)) { closesocket(cm_ptr->socket); cm_ptr->socket = DAPL_INVALID_SOCKET; dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr, diff --git a/dapl/openib_scm/dapl_ib_util.h b/dapl/openib_scm/dapl_ib_util.h index 4bb1a4a..5f9fb43 100644 --- a/dapl/openib_scm/dapl_ib_util.h +++ b/dapl/openib_scm/dapl_ib_util.h @@ -40,6 +40,7 @@ struct ib_cm_handle DAPL_OS_LOCK lock; int ref_count; int state; + int retry; DAPL_SOCKET socket; struct dapl_hca *hca; struct dapl_sp *sp; @@ -63,6 +64,7 @@ typedef dp_ib_cm_handle_t ib_cm_srvc_handle_t; #define SCM_ACK_RETRY 7 /* 3 bits, 7 * 268ms = 1.8 seconds */ #define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */ #define SCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */ +#define SCM_CR_RETRY 5 /* retries for busy server, connect refused */ #define SCM_IB_MTU 2048 /* Global routing defaults */ -- 1.7.3 _______________________________________________ ofw mailing list [email 
protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
