George -- Similar question on this one: should it be CMR'ed to v1.7? (I kind of doubt it's appropriate for v1.6.)
On Jan 21, 2013, at 6:41 AM, svn-commit-mai...@open-mpi.org wrote: > Author: bosilca (George Bosilca) > Date: 2013-01-21 06:41:08 EST (Mon, 21 Jan 2013) > New Revision: 27881 > URL: https://svn.open-mpi.org/trac/ompi/changeset/27881 > > Log: > Make the TCP BTL really fail-safe. It now trigger the error callback on > all pending fragments when the destination goes down. This allows the PML > to recalibrate its behavior, either find an alternate route or just give up. > > Text files modified: > trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c | 29 > +++++++++++++++++++++++++++-- > trunk/ompi/mca/btl/tcp/btl_tcp_frag.c | 7 ++++++- > > trunk/ompi/mca/btl/tcp/btl_tcp_proc.c | 2 +- > > 3 files changed, 34 insertions(+), 4 deletions(-) > > Modified: trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c > ============================================================================== > --- trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c Mon Jan 21 06:35:42 2013 > (r27880) > +++ trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c 2013-01-21 06:41:08 EST (Mon, > 21 Jan 2013) (r27881) > @@ -2,7 +2,7 @@ > * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana > * University Research and Technology > * Corporation. All rights reserved. > - * Copyright (c) 2004-2008 The University of Tennessee and The University > + * Copyright (c) 2004-2013 The University of Tennessee and The University > * of Tennessee Research Foundation. All rights > * reserved. 
> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, > @@ -295,6 +295,7 @@ > if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && > opal_socket_errno != EWOULDBLOCK) { > BTL_ERROR(("send() failed: %s (%d)", > strerror(opal_socket_errno), opal_socket_errno)); > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > return -1; > } > @@ -359,6 +360,7 @@ > mca_btl_tcp_endpoint_close(btl_endpoint); > btl_endpoint->endpoint_sd = sd; > if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) != > OMPI_SUCCESS) { > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); > OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); > @@ -389,7 +391,6 @@ > { > if(btl_endpoint->endpoint_sd < 0) > return; > - btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; > btl_endpoint->endpoint_retries++; > opal_event_del(&btl_endpoint->endpoint_recv_event); > opal_event_del(&btl_endpoint->endpoint_send_event); > @@ -401,6 +402,24 @@ > btl_endpoint->endpoint_cache_pos = NULL; > btl_endpoint->endpoint_cache_length = 0; > #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ > + /** > + * If we keep failing to connect to the peer let the caller know about > + * this situation by triggering all the pending fragments callback and > + * reporting the error. 
> + */ > + if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) { > + mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag; > + if( NULL == frag ) > + frag = > (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); > + while(NULL != frag) { > + frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, > &frag->base, OMPI_ERR_UNREACH); > + > + frag = > (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); > + } > + } else { > + btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; > + } > + > } > > /* > @@ -444,6 +463,7 @@ > > /* remote closed connection */ > if(retval == 0) { > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > return -1; > } > @@ -453,6 +473,7 @@ > if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && > opal_socket_errno != EWOULDBLOCK) { > BTL_ERROR(("recv(%d) failed: %s (%d)", > btl_endpoint->endpoint_sd, > strerror(opal_socket_errno), opal_socket_errno)); > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > return -1; > } > @@ -589,6 +610,7 @@ > address, > btl_endpoint->endpoint_addr->addr_port, > strerror(opal_socket_errno) ) ); > } > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > btl_endpoint->endpoint_retries++; > return OMPI_ERR_UNREACH; > @@ -599,6 +621,7 @@ > btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; > opal_event_add(&btl_endpoint->endpoint_recv_event, 0); > } else { > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > } > return rc; > @@ -645,6 +668,7 @@ > btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; > opal_event_add(&btl_endpoint->endpoint_recv_event, 0); > } else { > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > } > } > @@ -747,6 +771,7 @@ > default: > OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); > 
BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state)); > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > break; > } > > Modified: trunk/ompi/mca/btl/tcp/btl_tcp_frag.c > ============================================================================== > --- trunk/ompi/mca/btl/tcp/btl_tcp_frag.c Mon Jan 21 06:35:42 2013 > (r27880) > +++ trunk/ompi/mca/btl/tcp/btl_tcp_frag.c 2013-01-21 06:41:08 EST (Mon, > 21 Jan 2013) (r27881) > @@ -2,7 +2,7 @@ > * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana > * University Research and Technology > * Corporation. All rights reserved. > - * Copyright (c) 2004-2006 The University of Tennessee and The University > + * Copyright (c) 2004-2013 The University of Tennessee and The University > * of Tennessee Research Foundation. All rights > * reserved. > * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, > @@ -115,12 +115,14 @@ > BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, > %lu)\n\t%s(%lu)\n", > frag->iov_ptr[0].iov_base, (unsigned long) > frag->iov_ptr[0].iov_len, > strerror(opal_socket_errno), (unsigned long) > frag->iov_cnt)); > + frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(frag->endpoint); > return false; > default: > BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)", > strerror(opal_socket_errno), > opal_socket_errno)); > + frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(frag->endpoint); > return false; > } > @@ -195,6 +197,7 @@ > cnt = readv(sd, frag->iov_ptr, num_vecs); > if( 0 < cnt ) goto advance_iov_position; > if( cnt == 0 ) { > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > return false; > } > @@ -207,12 +210,14 @@ > BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, > %lu)\n\t%s(%lu)\n", > frag->iov_ptr[0].iov_base, (unsigned long) > frag->iov_ptr[0].iov_len, > 
strerror(opal_socket_errno), (unsigned long) > frag->iov_cnt)); > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > return false; > default: > BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", > strerror(opal_socket_errno), > opal_socket_errno)); > + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; > mca_btl_tcp_endpoint_close(btl_endpoint); > return false; > } > > Modified: trunk/ompi/mca/btl/tcp/btl_tcp_proc.c > ============================================================================== > --- trunk/ompi/mca/btl/tcp/btl_tcp_proc.c Mon Jan 21 06:35:42 2013 > (r27880) > +++ trunk/ompi/mca/btl/tcp/btl_tcp_proc.c 2013-01-21 06:41:08 EST (Mon, > 21 Jan 2013) (r27881) > @@ -680,7 +680,7 @@ > { > size_t i; > OPAL_THREAD_LOCK(&btl_proc->proc_lock); > - for(i=0; i<btl_proc->proc_endpoint_count; i++) { > + for(i = 0; i < btl_proc->proc_endpoint_count; i++) { > if(btl_proc->proc_endpoints[i] == btl_endpoint) { > memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1, > > (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*)); > _______________________________________________ > svn-full mailing list > svn-f...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/svn-full -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/