Nobody has cared about error cases so far, so I don't personally see any 
incentive to push this patch into 1.7 right now. But I won't be against it, 
as it doesn't hurt either.

  George.


On Jan 22, 2013, at 16:28 , "Jeff Squyres (jsquyres)" <jsquy...@cisco.com> 
wrote:

> George --
> 
> Similar question on this one: should it be CMR'ed to v1.7?  (I kinda doubt 
> it's appropriate for v1.6)
> 
> 
> On Jan 21, 2013, at 6:41 AM, svn-commit-mai...@open-mpi.org wrote:
> 
>> Author: bosilca (George Bosilca)
>> Date: 2013-01-21 06:41:08 EST (Mon, 21 Jan 2013)
>> New Revision: 27881
>> URL: https://svn.open-mpi.org/trac/ompi/changeset/27881
>> 
>> Log:
>> Make the TCP BTL really fail-safe. It now triggers the error callback on
>> all pending fragments when the destination goes down. This allows the PML
>> to recalibrate its behavior, either finding an alternate route or just giving up.
>> 
>> Text files modified: 
>>  trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c |    29 
>> +++++++++++++++++++++++++++--           
>>  trunk/ompi/mca/btl/tcp/btl_tcp_frag.c     |     7 ++++++-                   
>>               
>>  trunk/ompi/mca/btl/tcp/btl_tcp_proc.c     |     2 +-                        
>>               
>>  3 files changed, 34 insertions(+), 4 deletions(-)
>> 
>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
>> ==============================================================================
>> --- trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c        Mon Jan 21 06:35:42 
>> 2013        (r27880)
>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c        2013-01-21 06:41:08 EST 
>> (Mon, 21 Jan 2013)      (r27881)
>> @@ -2,7 +2,7 @@
>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
>> *                         University Research and Technology
>> *                         Corporation.  All rights reserved.
>> - * Copyright (c) 2004-2008 The University of Tennessee and The University
>> + * Copyright (c) 2004-2013 The University of Tennessee and The University
>> *                         of Tennessee Research Foundation.  All rights
>> *                         reserved.
>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
>> @@ -295,6 +295,7 @@
>>            if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && 
>> opal_socket_errno != EWOULDBLOCK) {
>>                BTL_ERROR(("send() failed: %s (%d)",
>>                           strerror(opal_socket_errno), opal_socket_errno));
>> +                btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>                mca_btl_tcp_endpoint_close(btl_endpoint);
>>                return -1;
>>            }
>> @@ -359,6 +360,7 @@
>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>        btl_endpoint->endpoint_sd = sd;
>>        if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) != 
>> OMPI_SUCCESS) {
>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>            mca_btl_tcp_endpoint_close(btl_endpoint);
>>            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
>>            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
>> @@ -389,7 +391,6 @@
>> {
>>    if(btl_endpoint->endpoint_sd < 0)
>>        return;
>> -    btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
>>    btl_endpoint->endpoint_retries++;
>>    opal_event_del(&btl_endpoint->endpoint_recv_event);
>>    opal_event_del(&btl_endpoint->endpoint_send_event);
>> @@ -401,6 +402,24 @@
>>    btl_endpoint->endpoint_cache_pos    = NULL;
>>    btl_endpoint->endpoint_cache_length = 0;
>> #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
>> +    /**
>> +     * If we keep failing to connect to the peer let the caller know about
>> +     * this situation by triggering all the pending fragments callback and
>> +     * reporting the error.
>> +     */
>> +    if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
>> +        mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
>> +        if( NULL == frag ) 
>> +            frag = 
>> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
>> +        while(NULL != frag) {
>> +            frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, 
>> &frag->base, OMPI_ERR_UNREACH);
>> +
>> +            frag = 
>> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
>> +        }
>> +    } else {
>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
>> +    }
>> +
>> }
>> 
>> /*
>> @@ -444,6 +463,7 @@
>> 
>>        /* remote closed connection */
>>        if(retval == 0) {
>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>            mca_btl_tcp_endpoint_close(btl_endpoint);
>>            return -1;
>>        }
>> @@ -453,6 +473,7 @@
>>            if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && 
>> opal_socket_errno != EWOULDBLOCK) {
>>                BTL_ERROR(("recv(%d) failed: %s (%d)",
>>                           btl_endpoint->endpoint_sd, 
>> strerror(opal_socket_errno), opal_socket_errno));
>> +                btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>                mca_btl_tcp_endpoint_close(btl_endpoint);
>>                return -1;
>>            }
>> @@ -589,6 +610,7 @@
>>                            address,
>>                           btl_endpoint->endpoint_addr->addr_port, 
>> strerror(opal_socket_errno) ) );
>>        }
>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>        btl_endpoint->endpoint_retries++;
>>        return OMPI_ERR_UNREACH;
>> @@ -599,6 +621,7 @@
>>        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
>>        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
>>    } else {
>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>    }
>>    return rc;
>> @@ -645,6 +668,7 @@
>>        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
>>        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
>>    } else {
>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>    }
>> }
>> @@ -747,6 +771,7 @@
>>    default:
>>        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
>>        BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state));
>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>        break;
>>    }
>> 
>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_frag.c
>> ==============================================================================
>> --- trunk/ompi/mca/btl/tcp/btl_tcp_frag.c    Mon Jan 21 06:35:42 2013        
>> (r27880)
>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_frag.c    2013-01-21 06:41:08 EST (Mon, 
>> 21 Jan 2013)      (r27881)
>> @@ -2,7 +2,7 @@
>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
>> *                         University Research and Technology
>> *                         Corporation.  All rights reserved.
>> - * Copyright (c) 2004-2006 The University of Tennessee and The University
>> + * Copyright (c) 2004-2013 The University of Tennessee and The University
>> *                         of Tennessee Research Foundation.  All rights
>> *                         reserved.
>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
>> @@ -115,12 +115,14 @@
>>                BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, 
>> %lu)\n\t%s(%lu)\n",
>>                    frag->iov_ptr[0].iov_base, (unsigned long) 
>> frag->iov_ptr[0].iov_len,
>>                    strerror(opal_socket_errno), (unsigned long) 
>> frag->iov_cnt));
>> +                frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>                mca_btl_tcp_endpoint_close(frag->endpoint);
>>                return false;
>>            default:
>>                BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)", 
>>                           strerror(opal_socket_errno),
>>                           opal_socket_errno));
>> +                frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>                mca_btl_tcp_endpoint_close(frag->endpoint);
>>                return false;
>>            }
>> @@ -195,6 +197,7 @@
>>        cnt = readv(sd, frag->iov_ptr, num_vecs);
>>      if( 0 < cnt ) goto advance_iov_position;
>>      if( cnt == 0 ) {
>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>          mca_btl_tcp_endpoint_close(btl_endpoint);
>>          return false;
>>      }
>> @@ -207,12 +210,14 @@
>>            BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, 
>> %lu)\n\t%s(%lu)\n",
>>                       frag->iov_ptr[0].iov_base, (unsigned long) 
>> frag->iov_ptr[0].iov_len,
>>                       strerror(opal_socket_errno), (unsigned long) 
>> frag->iov_cnt));
>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>          mca_btl_tcp_endpoint_close(btl_endpoint);
>>          return false;
>>      default:
>>            BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", 
>>                       strerror(opal_socket_errno),
>>                       opal_socket_errno));
>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>          mca_btl_tcp_endpoint_close(btl_endpoint);
>>          return false;
>>      }
>> 
>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_proc.c
>> ==============================================================================
>> --- trunk/ompi/mca/btl/tcp/btl_tcp_proc.c    Mon Jan 21 06:35:42 2013        
>> (r27880)
>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_proc.c    2013-01-21 06:41:08 EST (Mon, 
>> 21 Jan 2013)      (r27881)
>> @@ -680,7 +680,7 @@
>> {
>>    size_t i;
>>    OPAL_THREAD_LOCK(&btl_proc->proc_lock);
>> -    for(i=0; i<btl_proc->proc_endpoint_count; i++) {
>> +    for(i = 0; i < btl_proc->proc_endpoint_count; i++) {
>>        if(btl_proc->proc_endpoints[i] == btl_endpoint) {
>>            memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1,
>>                
>> (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*));
>> _______________________________________________
>> svn-full mailing list
>> svn-f...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
> 
> 
> -- 
> Jeff Squyres
> jsquy...@cisco.com
> For corporate legal information go to: 
> http://www.cisco.com/web/about/doing_business/legal/cri/
> 
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel


Reply via email to