While we always strive to improve this functionality, it has been available as a 
separate software package for quite some time.

  George.


On Jan 23, 2013, at 08:05 , Jeff Squyres (jsquyres) <jsquy...@cisco.com> wrote:

> Are you going to develop anything further with regards to this functionality, 
> and target that stuff for v1.7?  Or should all of this just wait until 1.9?
> 
> (I don't really care either way; I'm asking out of curiosity)
> 
> 
> On Jan 22, 2013, at 7:24 PM, George Bosilca <bosi...@icl.utk.edu> wrote:
> 
>> Nobody has cared about error cases so far, so I don't personally see any 
>> incentive to push this patch into 1.7 right now. But I won't be against it, 
>> as it does not hurt either.
>> 
>> George.
>> 
>> 
>> On Jan 22, 2013, at 16:28 , "Jeff Squyres (jsquyres)" <jsquy...@cisco.com> 
>> wrote:
>> 
>>> George --
>>> 
>>> Similar question on this one: should it be CMR'ed to v1.7?  (I kinda doubt 
>>> it's appropriate for v1.6)
>>> 
>>> 
>>> On Jan 21, 2013, at 6:41 AM, svn-commit-mai...@open-mpi.org wrote:
>>> 
>>>> Author: bosilca (George Bosilca)
>>>> Date: 2013-01-21 06:41:08 EST (Mon, 21 Jan 2013)
>>>> New Revision: 27881
>>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/27881
>>>> 
>>>> Log:
>>>> Make the TCP BTL really fail-safe. It now triggers the error callback on
>>>> all pending fragments when the destination goes down. This allows the PML
>>>> to recalibrate its behavior, either find an alternate route or just give 
>>>> up.
>>>> 
>>>> Text files modified: 
>>>> trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c |    29 
>>>> +++++++++++++++++++++++++++--           
>>>> trunk/ompi/mca/btl/tcp/btl_tcp_frag.c     |     7 ++++++-                  
>>>>                
>>>> trunk/ompi/mca/btl/tcp/btl_tcp_proc.c     |     2 +-                       
>>>>                
>>>> 3 files changed, 34 insertions(+), 4 deletions(-)
>>>> 
>>>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
>>>> ==============================================================================
>>>> --- trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c      Mon Jan 21 06:35:42 
>>>> 2013        (r27880)
>>>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c      2013-01-21 06:41:08 EST 
>>>> (Mon, 21 Jan 2013)      (r27881)
>>>> @@ -2,7 +2,7 @@
>>>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
>>>> *                         University Research and Technology
>>>> *                         Corporation.  All rights reserved.
>>>> - * Copyright (c) 2004-2008 The University of Tennessee and The University
>>>> + * Copyright (c) 2004-2013 The University of Tennessee and The University
>>>> *                         of Tennessee Research Foundation.  All rights
>>>> *                         reserved.
>>>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
>>>> @@ -295,6 +295,7 @@
>>>>          if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && 
>>>> opal_socket_errno != EWOULDBLOCK) {
>>>>              BTL_ERROR(("send() failed: %s (%d)",
>>>>                         strerror(opal_socket_errno), opal_socket_errno));
>>>> +                btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>              mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>              return -1;
>>>>          }
>>>> @@ -359,6 +360,7 @@
>>>>      mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>      btl_endpoint->endpoint_sd = sd;
>>>>      if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) != 
>>>> OMPI_SUCCESS) {
>>>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>          mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>          OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
>>>>          OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
>>>> @@ -389,7 +391,6 @@
>>>> {
>>>>  if(btl_endpoint->endpoint_sd < 0)
>>>>      return;
>>>> -    btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
>>>>  btl_endpoint->endpoint_retries++;
>>>>  opal_event_del(&btl_endpoint->endpoint_recv_event);
>>>>  opal_event_del(&btl_endpoint->endpoint_send_event);
>>>> @@ -401,6 +402,24 @@
>>>>  btl_endpoint->endpoint_cache_pos    = NULL;
>>>>  btl_endpoint->endpoint_cache_length = 0;
>>>> #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
>>>> +    /**
>>>> +     * If we keep failing to connect to the peer let the caller know about
>>>> +     * this situation by triggering all the pending fragments callback and
>>>> +     * reporting the error.
>>>> +     */
>>>> +    if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
>>>> +        mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
>>>> +        if( NULL == frag ) 
>>>> +            frag = 
>>>> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
>>>> +        while(NULL != frag) {
>>>> +            frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, 
>>>> &frag->base, OMPI_ERR_UNREACH);
>>>> +
>>>> +            frag = 
>>>> (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
>>>> +        }
>>>> +    } else {
>>>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
>>>> +    }
>>>> +
>>>> }
>>>> 
>>>> /*
>>>> @@ -444,6 +463,7 @@
>>>> 
>>>>      /* remote closed connection */
>>>>      if(retval == 0) {
>>>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>          mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>          return -1;
>>>>      }
>>>> @@ -453,6 +473,7 @@
>>>>          if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && 
>>>> opal_socket_errno != EWOULDBLOCK) {
>>>>              BTL_ERROR(("recv(%d) failed: %s (%d)",
>>>>                         btl_endpoint->endpoint_sd, 
>>>> strerror(opal_socket_errno), opal_socket_errno));
>>>> +                btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>              mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>              return -1;
>>>>          }
>>>> @@ -589,6 +610,7 @@
>>>>                          address,
>>>>                         btl_endpoint->endpoint_addr->addr_port, 
>>>> strerror(opal_socket_errno) ) );
>>>>      }
>>>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>      mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>      btl_endpoint->endpoint_retries++;
>>>>      return OMPI_ERR_UNREACH;
>>>> @@ -599,6 +621,7 @@
>>>>      btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
>>>>      opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
>>>>  } else {
>>>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>      mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>  }
>>>>  return rc;
>>>> @@ -645,6 +668,7 @@
>>>>      btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
>>>>      opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
>>>>  } else {
>>>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>      mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>  }
>>>> }
>>>> @@ -747,6 +771,7 @@
>>>>  default:
>>>>      OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
>>>>      BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state));
>>>> +        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>      mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>      break;
>>>>  }
>>>> 
>>>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_frag.c
>>>> ==============================================================================
>>>> --- trunk/ompi/mca/btl/tcp/btl_tcp_frag.c  Mon Jan 21 06:35:42 2013        
>>>> (r27880)
>>>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_frag.c  2013-01-21 06:41:08 EST (Mon, 
>>>> 21 Jan 2013)      (r27881)
>>>> @@ -2,7 +2,7 @@
>>>> * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
>>>> *                         University Research and Technology
>>>> *                         Corporation.  All rights reserved.
>>>> - * Copyright (c) 2004-2006 The University of Tennessee and The University
>>>> + * Copyright (c) 2004-2013 The University of Tennessee and The University
>>>> *                         of Tennessee Research Foundation.  All rights
>>>> *                         reserved.
>>>> * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
>>>> @@ -115,12 +115,14 @@
>>>>              BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, 
>>>> %lu)\n\t%s(%lu)\n",
>>>>                  frag->iov_ptr[0].iov_base, (unsigned long) 
>>>> frag->iov_ptr[0].iov_len,
>>>>                  strerror(opal_socket_errno), (unsigned long) 
>>>> frag->iov_cnt));
>>>> +                frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>              mca_btl_tcp_endpoint_close(frag->endpoint);
>>>>              return false;
>>>>          default:
>>>>              BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)", 
>>>>                         strerror(opal_socket_errno),
>>>>                         opal_socket_errno));
>>>> +                frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>              mca_btl_tcp_endpoint_close(frag->endpoint);
>>>>              return false;
>>>>          }
>>>> @@ -195,6 +197,7 @@
>>>>      cnt = readv(sd, frag->iov_ptr, num_vecs);
>>>>    if( 0 < cnt ) goto advance_iov_position;
>>>>    if( cnt == 0 ) {
>>>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>        return false;
>>>>    }
>>>> @@ -207,12 +210,14 @@
>>>>          BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, 
>>>> %lu)\n\t%s(%lu)\n",
>>>>                     frag->iov_ptr[0].iov_base, (unsigned long) 
>>>> frag->iov_ptr[0].iov_len,
>>>>                     strerror(opal_socket_errno), (unsigned long) 
>>>> frag->iov_cnt));
>>>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>        return false;
>>>>    default:
>>>>          BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", 
>>>>                     strerror(opal_socket_errno),
>>>>                     opal_socket_errno));
>>>> +            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
>>>>        mca_btl_tcp_endpoint_close(btl_endpoint);
>>>>        return false;
>>>>    }
>>>> 
>>>> Modified: trunk/ompi/mca/btl/tcp/btl_tcp_proc.c
>>>> ==============================================================================
>>>> --- trunk/ompi/mca/btl/tcp/btl_tcp_proc.c  Mon Jan 21 06:35:42 2013        
>>>> (r27880)
>>>> +++ trunk/ompi/mca/btl/tcp/btl_tcp_proc.c  2013-01-21 06:41:08 EST (Mon, 
>>>> 21 Jan 2013)      (r27881)
>>>> @@ -680,7 +680,7 @@
>>>> {
>>>>  size_t i;
>>>>  OPAL_THREAD_LOCK(&btl_proc->proc_lock);
>>>> -    for(i=0; i<btl_proc->proc_endpoint_count; i++) {
>>>> +    for(i = 0; i < btl_proc->proc_endpoint_count; i++) {
>>>>      if(btl_proc->proc_endpoints[i] == btl_endpoint) {
>>>>          memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1,
>>>>              
>>>> (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*));
>>>> _______________________________________________
>>>> svn-full mailing list
>>>> svn-f...@open-mpi.org
>>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
>>> 
>>> 
>>> -- 
>>> Jeff Squyres
>>> jsquy...@cisco.com
>>> For corporate legal information go to: 
>>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>> 
>>> 
>>> _______________________________________________
>>> devel mailing list
>>> de...@open-mpi.org
>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>> 
>> 
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> 
> 
> -- 
> Jeff Squyres
> jsquy...@cisco.com
> For corporate legal information go to: 
> http://www.cisco.com/web/about/doing_business/legal/cri/
> 
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel


Reply via email to