Timeouts in rdma_resolve_addr and rdma_resolve_route do occur on some fabrics, and rdma_cm does not retry on its own, so this patch adds process-level retries. The two calls share a single budget: a combined total of 10 retries is allowed across address and route resolution.
Signed-off-by: David A. McMillen <[email protected]> --- rdma_bw.c | 16 ++++++++++++++++ rdma_lat.c | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 0 deletions(-) diff --git a/rdma_bw.c b/rdma_bw.c index 2628ac4..737558a 100755 --- a/rdma_bw.c +++ b/rdma_bw.c @@ -131,6 +131,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) char *service; int n; int sockfd = -1; + int n_retries = 10; struct rdma_cm_event *event; struct sockaddr_in sin; struct pingpong_context *ctx = NULL; @@ -152,6 +153,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); +retry_addr: if (rdma_resolve_addr(data->cm_id, NULL, (struct sockaddr *)&sin, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n", @@ -162,6 +164,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; + + if (event->event == RDMA_CM_EVENT_ADDR_ERROR + && n_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_addr; + } + if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); @@ -169,6 +178,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) } rdma_ack_cm_event(event); +retry_route: if (rdma_resolve_route(data->cm_id, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", pid, __func__); @@ -178,6 +188,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR + && n_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_route; + } + if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); diff --git a/rdma_lat.c b/rdma_lat.c index 
3681b35..1f65086 100755 --- a/rdma_lat.c +++ b/rdma_lat.c @@ -207,6 +207,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) char *service; int n; int sockfd = -1; + int n_retries = 10; struct rdma_cm_event *event; struct sockaddr_in sin; struct pingpong_context *ctx = NULL; @@ -228,6 +229,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); +retry_addr: if (rdma_resolve_addr(data->cm_id, NULL, (struct sockaddr *)&sin, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n", @@ -238,6 +240,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; + if (event->event == RDMA_CM_EVENT_ADDR_ERROR + && n_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_addr; + } + if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); @@ -245,6 +253,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) } rdma_ack_cm_event(event); +retry_route: if (rdma_resolve_route(data->cm_id, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", pid, __func__); @@ -254,6 +263,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR + && n_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_route; + } + if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); _______________________________________________ general mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
