Fix some timeout and long disconnect delay issues discovered
during scale-out testing. Add support for retrying rdma_cm address and route
resolution, controlled by new configuration options, and call rdma_disconnect()
upon receiving a disconnect request to force an immediate disconnect reply to
the remote side.
Here are the new options (environment variables) with their default settings:

DAPL_CM_ARP_TIMEOUT_MS     4000
DAPL_CM_ARP_RETRY_COUNT    15
DAPL_CM_ROUTE_TIMEOUT_MS   4000
DAPL_CM_ROUTE_RETRY_COUNT  15
Signed-off-by: Arlin Davis [EMAIL PROTECTED]
Index: dapl/openib_cma/dapl_ib_cm.c
===================================================================
--- dapl/openib_cma/dapl_ib_cm.c (revision 9916)
+++ dapl/openib_cma/dapl_ib_cm.c (working copy)
@@ -58,6 +58,9 @@
#include "dapl_ib_util.h"
#include <sys/poll.h>
#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
#include <rdma/rdma_cma_ib.h>
extern struct rdma_event_channel *g_cm_events;
@@ -99,8 +102,8 @@
&ipaddr->src_addr)->sin_addr.s_addr),
ntohl(((struct
sockaddr_in *)
&ipaddr->dst_addr)->sin_addr.s_addr));
-
- ret = rdma_resolve_route(conn->cm_id, 2000);
+
+ ret = rdma_resolve_route(conn->cm_id,
conn->route_timeout);
if (ret) {
dapl_dbg_log(DAPL_DBG_TYPE_ERR,
" rdma_connect failed: %s\n",strerror(errno));
@@ -120,6 +123,7 @@
struct rdma_addr *ipaddr =
&conn->cm_id->route.addr;
struct ib_addr *ibaddr =
&conn->cm_id->route.addr.addr.ibaddr;
#endif
+
dapl_dbg_log(DAPL_DBG_TYPE_CM,
"
route_resolve: cm_id %p SRC %x DST %x PORT %d\n",
conn->cm_id,
@@ -381,6 +385,7 @@
break;
case RDMA_CM_EVENT_DISCONNECTED:
+ rdma_disconnect(conn->cm_id);
/* force the DREP */
/* validate
EP handle */
if
(!DAPL_BAD_HANDLE(conn->ep, DAPL_MAGIC_EP))
dapl_evd_connection_callback(conn,
@@ -494,6 +499,7 @@
break;
case RDMA_CM_EVENT_DISCONNECTED:
+ rdma_disconnect(conn->cm_id);
/* force the DREP */
/* validate
SP handle context */
if
(!DAPL_BAD_HANDLE(conn->sp, DAPL_MAGIC_PSP) ||
!DAPL_BAD_HANDLE(conn->sp, DAPL_MAGIC_RSP))
@@ -543,7 +549,8 @@
IN void *p_data)
{
struct dapl_ep *ep_ptr = ep_handle;
-
+ struct dapl_cm_id *conn;
+
/* Sanity check */
if (NULL == ep_ptr)
return
DAT_SUCCESS;
@@ -552,36 +559,38 @@
r_qual,p_data,p_size);
/* rdma conn and cm_id pre-bound; reference via
qp_handle */
- ep_ptr->cm_handle = ep_ptr->qp_handle;
+ conn = ep_ptr->cm_handle = ep_ptr->qp_handle;
/* Setup QP/CM parameters and private data in
cm_id */
- (void)dapl_os_memzero(&ep_ptr->cm_handle->params,
-
sizeof(ep_ptr->cm_handle->params));
- ep_ptr->cm_handle->params.responder_resources =
IB_TARGET_MAX;
- ep_ptr->cm_handle->params.initiator_depth =
IB_INITIATOR_DEPTH;
- ep_ptr->cm_handle->params.flow_control = 1;
- ep_ptr->cm_handle->params.rnr_retry_count =
IB_RNR_RETRY_COUNT;
- ep_ptr->cm_handle->params.retry_count =
IB_RC_RETRY_COUNT;
+ (void)dapl_os_memzero(&conn->params,
sizeof(conn->params));
+ conn->params.responder_resources = IB_TARGET_MAX;
+ conn->params.initiator_depth = IB_INITIATOR_DEPTH;
+ conn->params.flow_control = 1;
+ conn->params.rnr_retry_count = IB_RNR_RETRY_COUNT;
+ conn->params.retry_count = IB_RC_RETRY_COUNT;
if (p_size) {
- dapl_os_memcpy(ep_ptr->cm_handle->p_data,
p_data, p_size);
- ep_ptr->cm_handle->params.private_data
=
- ep_ptr->cm_handle->p_data;
- ep_ptr->cm_handle->params.private_data_len
= p_size;
+ dapl_os_memcpy(conn->p_data,
p_data, p_size);
+ conn->params.private_data
= conn->p_data;
+ conn->params.private_data_len
= p_size;
}
+ /* copy in remote address, need a copy for retry
attempts */
+ dapl_os_memcpy(&conn->r_addr, r_addr,
sizeof(*r_addr));
+
/* Resolve remote address, src already bound
during QP create */
- ((struct sockaddr_in*)r_addr)->sin_port =
htons(MAKE_PORT(r_qual));
- if (rdma_resolve_addr(ep_ptr->cm_handle->cm_id,
-
NULL, (struct sockaddr *)r_addr, 2000))
+ ((struct
sockaddr_in*)&conn->r_addr)->sin_port = htons(MAKE_PORT(r_qual));
+ ((struct
sockaddr_in*)&conn->r_addr)->sin_family = AF_INET;
+
+ if (rdma_resolve_addr(conn->cm_id, NULL,
+
(struct sockaddr *)&conn->r_addr,
+
conn->arp_timeout))
return
dapl_convert_errno(errno,"ib_connect");
dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " connect:
resolve_addr: cm_id %p SRC %x DST %x port %d\n",
- ep_ptr->cm_handle->cm_id,
- ntohl(((struct
sockaddr_in *)
-
&ep_ptr->cm_handle->hca->hca_address)->sin_addr.s_addr),
- ntohl(((struct
sockaddr_in *)r_addr)->sin_addr.s_addr),
- MAKE_PORT(r_qual)
);
+ " connect:
resolve_addr: cm_id %p -> %s port %d\n",
+ conn->cm_id,
+ inet_ntoa(((struct
sockaddr_in *)&conn->r_addr)->sin_addr),
+ ((struct
sockaddr_in*)&conn->r_addr)->sin_port );
return DAT_SUCCESS;
}
@@ -1163,15 +1172,58 @@
case
RDMA_CM_EVENT_ADDR_RESOLVED:
dapli_addr_resolve(conn);
break;
+
case
RDMA_CM_EVENT_ROUTE_RESOLVED:
dapli_route_resolve(conn);
break;
+
case
RDMA_CM_EVENT_ADDR_ERROR:
+ dapl_dbg_log(DAPL_DBG_TYPE_WARN,
+
" CM ADDR ERROR: -> %s retry (%d)..\n",
+
inet_ntoa(((struct sockaddr_in *)
+ &conn->r_addr)->sin_addr),
+ conn->arp_retries);
+
+ /*
retry address resolution */
+ if
(--conn->arp_retries) {
+ int
ret;
+ ret
= rdma_resolve_addr(
+ conn->cm_id,
NULL,
+ (struct
sockaddr *)&conn->r_addr,
+ conn->arp_timeout);
+ if
(!ret)
+ break;
+ else
{
+ dapl_dbg_log(
+ DAPL_DBG_TYPE_WARN,
+ "
ERROR: rdma_resolve_addr = "
+ "%d
%s\n",
+ ret,strerror(errno));
+ }
+ }
+ /*
retries exhausted or resolve_addr failed */
+ dapl_evd_connection_callback(
+ conn,
IB_CME_DESTINATION_UNREACHABLE,
+ NULL,
conn->ep);
+ break;
+
+
case
RDMA_CM_EVENT_ROUTE_ERROR:
- dapl_evd_connection_callback(conn,
-
IB_CME_DESTINATION_UNREACHABLE,
-
NULL, conn->ep);
+ dapl_dbg_log(DAPL_DBG_TYPE_WARN,
+
" CM ROUTE ERROR: -> %s retry (%d)..\n",
+
inet_ntoa(((struct sockaddr_in *)
+ &conn->r_addr)->sin_addr),
+
conn->route_retries );
+
+ /*
retry route resolution */
+ if
(--conn->route_retries)
+ dapli_addr_resolve(conn);
+ else
+ dapl_evd_connection_callback(
conn,
+ IB_CME_DESTINATION_UNREACHABLE,
+ NULL,
conn->ep);
break;
+
case
RDMA_CM_EVENT_DEVICE_REMOVAL:
dapl_evd_connection_callback(conn,
IB_CME_LOCAL_FAILURE,
Index: dapl/openib_cma/dapl_ib_qp.c
===================================================================
--- dapl/openib_cma/dapl_ib_qp.c (revision 10032)
+++ dapl/openib_cma/dapl_ib_qp.c (working copy)
@@ -160,6 +168,17 @@
conn->cm_id = cm_id;
conn->ep = ep_ptr;
conn->hca = ia_ptr->hca_ptr;
+
+ /* setup timers for address and route resolution */
+ conn->arp_timeout =
dapl_os_get_env_val("DAPL_CM_ARP_TIMEOUT_MS",
+ IB_ARP_TIMEOUT);
+ conn->arp_retries =
dapl_os_get_env_val("DAPL_CM_ARP_RETRY_COUNT",
+ IB_ARP_RETRY_COUNT);
+ conn->route_timeout =
dapl_os_get_env_val("DAPL_CM_ROUTE_TIMEOUT_MS",
+
IB_ROUTE_TIMEOUT);
+ conn->route_retries =
dapl_os_get_env_val("DAPL_CM_ROUTE_RETRY_COUNT",
+
IB_ROUTE_RETRY_COUNT);
+
ep_ptr->qp_handle = conn;
ep_ptr->qp_state = IB_QP_STATE_INIT;
Index: dapl/openib_cma/dapl_ib_util.h
===================================================================
--- dapl/openib_cma/dapl_ib_util.h (revision 9916)
+++ dapl/openib_cma/dapl_ib_util.h (working copy)
@@ -67,8 +67,12 @@
#define IB_RC_RETRY_COUNT 7
#define IB_RNR_RETRY_COUNT 7
-#define IB_CM_RESPONSE_TIMEOUT 20 /* 4 sec */
-#define
IB_CM_RETRIES 15
+#define IB_CM_RESPONSE_TIMEOUT 23 /* 16 sec */
+#define
IB_CM_RETRIES 15 /*
240 sec total default */
+#define IB_ARP_TIMEOUT 4000 /* 4 sec
*/
+#define IB_ARP_RETRY_COUNT 15 /* 60 sec total */
+#define IB_ROUTE_TIMEOUT 4000 /* 4 sec */
+#define IB_ROUTE_RETRY_COUNT 15 /* 60 sec total */
#define IB_REQ_MRA_TIMEOUT 27 /* a little
over 9 minutes */
#define IB_MAX_AT_RETRY 3
#define IB_TARGET_MAX 4 /*
max_qp_ous_rd_atom */
@@ -177,12 +181,17 @@
struct dapl_cm_id {
DAPL_OS_LOCK lock;
int destroy;
+ int arp_retries;
+ int arp_timeout;
+ int route_retries;
+ int route_timeout;
int in_callback;
struct rdma_cm_id *cm_id;
struct dapl_hca *hca;
struct dapl_sp *sp;
struct dapl_ep *ep;
struct rdma_conn_param params;
+ DAT_SOCK_ADDR6 r_addr;
int p_len;
unsigned char p_data[IB_MAX_DREP_PDATA_SIZE];
};