While we always strive to improve this functionality, it was available as a separate software package for quite some time.
George. On Jan 23, 2013, at 08:05 , Jeff Squyres (jsquyres) <jsquy...@cisco.com> wrote: > Are you going to develop anything further with regards to this functionality, > and target that stuff for v1.7? Or should all of this just wait until 1.9? > > (I don't really care either way; I'm asking out of curiosity) > > > On Jan 22, 2013, at 7:24 PM, George Bosilca <bosi...@icl.utk.edu> wrote: > >> Nobody cared about error cases so far, I don't personally see any incentive >> to push this patch in the 1.7 right now. But I won't be against as it is not >> hurting either. >> >> George. >> >> >> On Jan 22, 2013, at 16:28 , "Jeff Squyres (jsquyres)" <jsquy...@cisco.com> >> wrote: >> >>> George -- >>> >>> Similar question on this one: should it be CMR'ed to v1.7? (I kinda doubt >>> it's appropriate for v1.6) >>> >>> >>> On Jan 21, 2013, at 6:41 AM, svn-commit-mai...@open-mpi.org wrote: >>> >>>> Author: bosilca (George Bosilca) >>>> Date: 2013-01-21 06:41:08 EST (Mon, 21 Jan 2013) >>>> New Revision: 27881 >>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/27881 >>>> >>>> Log: >>>> Make the TCP BTL really fail-safe. It now trigger the error callback on >>>> all pending fragments when the destination goes down. This allows the PML >>>> to recalibrate its behavior, either find an alternate route or just give >>>> up. 
>>>> >>>> Text files modified: >>>> trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c | 29 >>>> +++++++++++++++++++++++++++-- >>>> trunk/ompi/mca/btl/tcp/btl_tcp_frag.c | 7 ++++++- >>>> >>>> trunk/ompi/mca/btl/tcp/btl_tcp_proc.c | 2 +- >>>> >>>> 3 files changed, 34 insertions(+), 4 deletions(-) >>>> >>>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c Mon Jan 21 06:35:42 >>>> 2013 (r27880) >>>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c 2013-01-21 06:41:08 EST >>>> (Mon, 21 Jan 2013) (r27881) >>>> @@ -2,7 +2,7 @@ >>>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana >>>> * University Research and Technology >>>> * Corporation. All rights reserved. >>>> - * Copyright (c) 2004-2008 The University of Tennessee and The University >>>> + * Copyright (c) 2004-2013 The University of Tennessee and The University >>>> * of Tennessee Research Foundation. All rights >>>> * reserved. 
>>>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, >>>> @@ -295,6 +295,7 @@ >>>> if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && >>>> opal_socket_errno != EWOULDBLOCK) { >>>> BTL_ERROR(("send() failed: %s (%d)", >>>> strerror(opal_socket_errno), opal_socket_errno)); >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> return -1; >>>> } >>>> @@ -359,6 +360,7 @@ >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> btl_endpoint->endpoint_sd = sd; >>>> if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) != >>>> OMPI_SUCCESS) { >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); >>>> OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); >>>> @@ -389,7 +391,6 @@ >>>> { >>>> if(btl_endpoint->endpoint_sd < 0) >>>> return; >>>> - btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; >>>> btl_endpoint->endpoint_retries++; >>>> opal_event_del(&btl_endpoint->endpoint_recv_event); >>>> opal_event_del(&btl_endpoint->endpoint_send_event); >>>> @@ -401,6 +402,24 @@ >>>> btl_endpoint->endpoint_cache_pos = NULL; >>>> btl_endpoint->endpoint_cache_length = 0; >>>> #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ >>>> + /** >>>> + * If we keep failing to connect to the peer let the caller know about >>>> + * this situation by triggering all the pending fragments callback and >>>> + * reporting the error. 
>>>> + */ >>>> + if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) { >>>> + mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag; >>>> + if( NULL == frag ) >>>> + frag = >>>> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); >>>> + while(NULL != frag) { >>>> + frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, >>>> &frag->base, OMPI_ERR_UNREACH); >>>> + >>>> + frag = >>>> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags); >>>> + } >>>> + } else { >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; >>>> + } >>>> + >>>> } >>>> >>>> /* >>>> @@ -444,6 +463,7 @@ >>>> >>>> /* remote closed connection */ >>>> if(retval == 0) { >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> return -1; >>>> } >>>> @@ -453,6 +473,7 @@ >>>> if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && >>>> opal_socket_errno != EWOULDBLOCK) { >>>> BTL_ERROR(("recv(%d) failed: %s (%d)", >>>> btl_endpoint->endpoint_sd, >>>> strerror(opal_socket_errno), opal_socket_errno)); >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> return -1; >>>> } >>>> @@ -589,6 +610,7 @@ >>>> address, >>>> btl_endpoint->endpoint_addr->addr_port, >>>> strerror(opal_socket_errno) ) ); >>>> } >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> btl_endpoint->endpoint_retries++; >>>> return OMPI_ERR_UNREACH; >>>> @@ -599,6 +621,7 @@ >>>> btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; >>>> opal_event_add(&btl_endpoint->endpoint_recv_event, 0); >>>> } else { >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> } >>>> return rc; >>>> @@ -645,6 +668,7 @@ >>>> btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; >>>> opal_event_add(&btl_endpoint->endpoint_recv_event, 0); >>>> } else { >>>> + 
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> } >>>> } >>>> @@ -747,6 +771,7 @@ >>>> default: >>>> OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); >>>> BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state)); >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> break; >>>> } >>>> >>>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_frag.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/btl/tcp/btl_tcp_frag.c Mon Jan 21 06:35:42 2013 >>>> (r27880) >>>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_frag.c 2013-01-21 06:41:08 EST (Mon, >>>> 21 Jan 2013) (r27881) >>>> @@ -2,7 +2,7 @@ >>>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana >>>> * University Research and Technology >>>> * Corporation. All rights reserved. >>>> - * Copyright (c) 2004-2006 The University of Tennessee and The University >>>> + * Copyright (c) 2004-2013 The University of Tennessee and The University >>>> * of Tennessee Research Foundation. All rights >>>> * reserved. 
>>>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, >>>> @@ -115,12 +115,14 @@ >>>> BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, >>>> %lu)\n\t%s(%lu)\n", >>>> frag->iov_ptr[0].iov_base, (unsigned long) >>>> frag->iov_ptr[0].iov_len, >>>> strerror(opal_socket_errno), (unsigned long) >>>> frag->iov_cnt)); >>>> + frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(frag->endpoint); >>>> return false; >>>> default: >>>> BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)", >>>> strerror(opal_socket_errno), >>>> opal_socket_errno)); >>>> + frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(frag->endpoint); >>>> return false; >>>> } >>>> @@ -195,6 +197,7 @@ >>>> cnt = readv(sd, frag->iov_ptr, num_vecs); >>>> if( 0 < cnt ) goto advance_iov_position; >>>> if( cnt == 0 ) { >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> return false; >>>> } >>>> @@ -207,12 +210,14 @@ >>>> BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, >>>> %lu)\n\t%s(%lu)\n", >>>> frag->iov_ptr[0].iov_base, (unsigned long) >>>> frag->iov_ptr[0].iov_len, >>>> strerror(opal_socket_errno), (unsigned long) >>>> frag->iov_cnt)); >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> return false; >>>> default: >>>> BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", >>>> strerror(opal_socket_errno), >>>> opal_socket_errno)); >>>> + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; >>>> mca_btl_tcp_endpoint_close(btl_endpoint); >>>> return false; >>>> } >>>> >>>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_proc.c >>>> ============================================================================== >>>> --- trunk/ompi/mca/btl/tcp/btl_tcp_proc.c Mon Jan 21 06:35:42 2013 >>>> (r27880) >>>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_proc.c 2013-01-21 06:41:08 EST (Mon, >>>> 21 Jan 2013) (r27881) >>>> 
@@ -680,7 +680,7 @@ >>>> { >>>> size_t i; >>>> OPAL_THREAD_LOCK(&btl_proc->proc_lock); >>>> - for(i=0; i<btl_proc->proc_endpoint_count; i++) { >>>> + for(i = 0; i < btl_proc->proc_endpoint_count; i++) { >>>> if(btl_proc->proc_endpoints[i] == btl_endpoint) { >>>> memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1, >>>> >>>> (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*)); >>>> _______________________________________________ >>>> svn-full mailing list >>>> svn-f...@open-mpi.org >>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full >>> >>> >>> -- >>> Jeff Squyres >>> jsquy...@cisco.com >>> For corporate legal information go to: >>> http://www.cisco.com/web/about/doing_business/legal/cri/ >>> >>> >>> _______________________________________________ >>> devel mailing list >>> de...@open-mpi.org >>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >> >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/devel > > > -- > Jeff Squyres > jsquy...@cisco.com > For corporate legal information go to: > http://www.cisco.com/web/about/doing_business/legal/cri/ > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel