Nobody has cared about error cases so far, so I don't personally see any incentive to push this patch into 1.7 right now. But I won't object, as it doesn't hurt either.
George. On Jan 22, 2013, at 16:28 , "Jeff Squyres (jsquyres)" <jsquy...@cisco.com> wrote: > George -- > > Similar question on this one: should it be CMR'ed to v1.7? (I kinda doubt > it's appropriate for v1.6) > > > On Jan 21, 2013, at 6:41 AM, svn-commit-mai...@open-mpi.org wrote: > >> Author: bosilca (George Bosilca) >> Date: 2013-01-21 06:41:08 EST (Mon, 21 Jan 2013) >> New Revision: 27881 >> URL: https://svn.open-mpi.org/trac/ompi/changeset/27881 >> >> Log: >> Make the TCP BTL really fail-safe. It now trigger the error callback on >> all pending fragments when the destination goes down. This allows the PML >> to recalibrate its behavior, either find an alternate route or just give up. >> >> Text files modified: >> trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c | 29 >> +++++++++++++++++++++++++++-- >> trunk/ompi/mca/btl/tcp/btl_tcp_frag.c | 7 ++++++- >> >> trunk/ompi/mca/btl/tcp/btl_tcp_proc.c | 2 +- >> >> 3 files changed, 34 insertions(+), 4 deletions(-) >> >> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c >> ============================================================================== >> --- trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c Mon Jan 21 06:35:42 >> 2013 (r27880) >> +++ trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c 2013-01-21 06:41:08 EST >> (Mon, 21 Jan 2013) (r27881) >> @@ -2,7 +2,7 @@ >> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana >> * University Research and Technology >> * Corporation. All rights reserved. >> - * Copyright (c) 2004-2008 The University of Tennessee and The University >> + * Copyright (c) 2004-2013 The University of Tennessee and The University >> * of Tennessee Research Foundation. All rights >> * reserved. 
>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, >> @@ -295,6 +295,7 @@ >> if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && >> opal_socket_errno != EWOULDBLOCK) { >> BTL_ERROR(("send() failed: %s (%d)", >> strerror(opal_socket_errno), opal_socket_errno)); >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> return -1; >> } >> @@ -359,6 +360,7 @@ >> mca_btl_tcp_endpoint_close(btl_endpoint); >> btl_endpoint->endpoint_sd = sd; >> if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) != >> OMPI_SUCCESS) { >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); >> OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); >> @@ -389,7 +391,6 @@ >> { >> if(btl_endpoint->endpoint_sd < 0) >> return; >> - btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; >> btl_endpoint->endpoint_retries++; >> opal_event_del(&btl_endpoint->endpoint_recv_event); >> opal_event_del(&btl_endpoint->endpoint_send_event); >> @@ -401,6 +402,24 @@ >> btl_endpoint->endpoint_cache_pos = NULL; >> btl_endpoint->endpoint_cache_length = 0; >> #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ >> + /** >> + * If we keep failing to connect to the peer let the caller know about >> + * this situation by triggering all the pending fragments callback and >> + * reporting the error. 
>> + */ >> + if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) { >> + mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag; >> + if( NULL == frag ) >> + frag = >> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); >> + while(NULL != frag) { >> + frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, >> &frag->base, OMPI_ERR_UNREACH); >> + >> + frag = >> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); >> + } >> + } else { >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; >> + } >> + >> } >> >> /* >> @@ -444,6 +463,7 @@ >> >> /* remote closed connection */ >> if(retval == 0) { >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> return -1; >> } >> @@ -453,6 +473,7 @@ >> if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && >> opal_socket_errno != EWOULDBLOCK) { >> BTL_ERROR(("recv(%d) failed: %s (%d)", >> btl_endpoint->endpoint_sd, >> strerror(opal_socket_errno), opal_socket_errno)); >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> return -1; >> } >> @@ -589,6 +610,7 @@ >> address, >> btl_endpoint->endpoint_addr->addr_port, >> strerror(opal_socket_errno) ) ); >> } >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> btl_endpoint->endpoint_retries++; >> return OMPI_ERR_UNREACH; >> @@ -599,6 +621,7 @@ >> btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; >> opal_event_add(&btl_endpoint->endpoint_recv_event, 0); >> } else { >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> } >> return rc; >> @@ -645,6 +668,7 @@ >> btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; >> opal_event_add(&btl_endpoint->endpoint_recv_event, 0); >> } else { >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> } >> } >> @@ -747,6 +771,7 @@ >> 
default: >> OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); >> BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state)); >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> break; >> } >> >> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_frag.c >> ============================================================================== >> --- trunk/ompi/mca/btl/tcp/btl_tcp_frag.c Mon Jan 21 06:35:42 2013 >> (r27880) >> +++ trunk/ompi/mca/btl/tcp/btl_tcp_frag.c 2013-01-21 06:41:08 EST (Mon, >> 21 Jan 2013) (r27881) >> @@ -2,7 +2,7 @@ >> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana >> * University Research and Technology >> * Corporation. All rights reserved. >> - * Copyright (c) 2004-2006 The University of Tennessee and The University >> + * Copyright (c) 2004-2013 The University of Tennessee and The University >> * of Tennessee Research Foundation. All rights >> * reserved. >> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, >> @@ -115,12 +115,14 @@ >> BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, >> %lu)\n\t%s(%lu)\n", >> frag->iov_ptr[0].iov_base, (unsigned long) >> frag->iov_ptr[0].iov_len, >> strerror(opal_socket_errno), (unsigned long) >> frag->iov_cnt)); >> + frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(frag->endpoint); >> return false; >> default: >> BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)", >> strerror(opal_socket_errno), >> opal_socket_errno)); >> + frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(frag->endpoint); >> return false; >> } >> @@ -195,6 +197,7 @@ >> cnt = readv(sd, frag->iov_ptr, num_vecs); >> if( 0 < cnt ) goto advance_iov_position; >> if( cnt == 0 ) { >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> return false; >> } >> @@ -207,12 +210,14 @@ >> BTL_ERROR(("mca_btl_tcp_frag_recv: readv 
error (%p, >> %lu)\n\t%s(%lu)\n", >> frag->iov_ptr[0].iov_base, (unsigned long) >> frag->iov_ptr[0].iov_len, >> strerror(opal_socket_errno), (unsigned long) >> frag->iov_cnt)); >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> return false; >> default: >> BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", >> strerror(opal_socket_errno), >> opal_socket_errno)); >> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >> mca_btl_tcp_endpoint_close(btl_endpoint); >> return false; >> } >> >> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_proc.c >> ============================================================================== >> --- trunk/ompi/mca/btl/tcp/btl_tcp_proc.c Mon Jan 21 06:35:42 2013 >> (r27880) >> +++ trunk/ompi/mca/btl/tcp/btl_tcp_proc.c 2013-01-21 06:41:08 EST (Mon, >> 21 Jan 2013) (r27881) >> @@ -680,7 +680,7 @@ >> { >> size_t i; >> OPAL_THREAD_LOCK(&btl_proc->proc_lock); >> - for(i=0; i<btl_proc->proc_endpoint_count; i++) { >> + for(i = 0; i < btl_proc->proc_endpoint_count; i++) { >> if(btl_proc->proc_endpoints[i] == btl_endpoint) { >> memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1, >> >> (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*)); >> _______________________________________________ >> svn-full mailing list >> svn-f...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full > > > -- > Jeff Squyres > jsquy...@cisco.com > For corporate legal information go to: > http://www.cisco.com/web/about/doing_business/legal/cri/ > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel