IBAL: Add exponential backoff to SA queries when the specified timeout is negative. The negated timeout value is split into two 16-bit halves: the upper half is the maximum delay, which caps the exponential backoff, and the lower half is the starting delay. A random (per-host) jitter is added to the timeout so that concurrent queries (as you might see with an MPI all-to-all) don't flood the SA.
Signed-off-by: Fab Tillier <[email protected]> diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\al_mad.c .\core\al\al_mad.c --- \dev\openib\Mellanox\11011\core\al\al_mad.c Thu Sep 20 17:51:04 2012 +++ .\core\al\al_mad.c Tue Oct 09 08:49:38 2012 @@ -32,6 +32,7 @@ #include <iba/ib_al.h> #include <complib/cl_byteswap.h> #include <complib/cl_timer.h> +#include <limits.h> #include "al.h" #include "al_debug.h" @@ -178,9 +179,10 @@ __cleanup_mad_send( IN ib_mad_send_handle_t h_send, IN uint16_t ctx); -static __inline void +static __inline int __set_retry_time( - IN ib_mad_send_handle_t h_send ); + IN ib_mad_send_handle_t h_send, + IN ULONG send_jitter ); static void __mad_svc_send_done( @@ -1071,6 +1073,7 @@ reg_mad_svc( ib_mad_svc_handle_t h_mad_svc; al_qp_alias_t *p_qp_alias; ib_qp_attr_t qp_attr; + static ULONG seed = 0; AL_ENTER( AL_DBG_MAD_SVC ); CL_ASSERT( h_qp ); @@ -1108,6 +1111,14 @@ reg_mad_svc( cl_qlist_init( &h_mad_svc->send_list ); cl_qlist_init( &h_mad_svc->recv_list ); + if( seed == 0 ) + { + seed = (ULONG)(ULONG_PTR)p_mad_svc; + } +#ifdef CL_KERNEL + h_mad_svc->send_jitter = RtlRandomEx( &seed ); +#endif + p_qp_alias = PARENT_STRUCT( h_qp, al_qp_alias_t, qp ); h_mad_svc->svc_type = p_mad_svc->svc_type; h_mad_svc->obj.context = p_mad_svc->mad_svc_context; @@ -1967,9 +1976,8 @@ __mad_svc_send_done( ("waiting for response for TID:0x%I64x\n", __get_send_tid( h_send )) ); - __set_retry_time( h_send ); cl_timer_trim( &h_mad_svc->send_timer, - h_send->p_send_mad->timeout_ms ); + __set_retry_time( h_send, h_mad_svc->send_jitter ) ); } cl_spinlock_release( &h_mad_svc->obj.lock ); } @@ -2962,14 +2972,51 @@ __process_rmpp_nack( -static __inline void +static __inline int __set_retry_time( - IN ib_mad_send_handle_t h_send ) + IN ib_mad_send_handle_t h_send, + IN ULONG send_jitter ) { - h_send->retry_time = - (uint64_t)(h_send->p_send_mad->timeout_ms + h_send->delay) * 1000Ui64 + - cl_get_time_stamp(); + int timeout = 
(int)h_send->p_send_mad->timeout_ms; + + // + // Negative values indicate recursive doubling. + // + if( timeout < 0 ) + { + int max; + timeout = -timeout; + max = timeout >> 16; + timeout &= 0xFFFFUL; + + if( max == 0 ) + { + max = SHRT_MAX; + } + + if( (timeout * 2) <= max ) + { + // + // Double the timeout for the next iteration. + // + h_send->p_send_mad->timeout_ms = (ULONG)-((max << 16) | (timeout * 2)); + } + else + { + h_send->p_send_mad->timeout_ms = (ULONG)-((max << 16) | max); + } + } + + // + // Add some jitter, random number between 0 and 1/2 timeout. + // Note that this is in microseconds and not milliseconds. + // + timeout += (send_jitter % timeout) / 2; + timeout += h_send->delay; + + h_send->retry_time = (uint64_t)(timeout) * 1000Ui64 + cl_get_time_stamp(); h_send->delay = 0; + return timeout; } @@ -3076,9 +3123,8 @@ __check_send_queue( else { /* The send was delivered. Continue waiting. */ - __set_retry_time( h_send ); cl_timer_trim( &h_mad_svc->send_timer, - ((uint32_t)(h_send->retry_time - cur_time) / 1000) ); + __set_retry_time( h_send, h_mad_svc->send_jitter ) ); } } else diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\al_mad.h .\core\al\al_mad.h --- \dev\openib\Mellanox\11011\core\al\al_mad.h Thu Sep 20 17:51:04 2012 +++ .\core\al\al_mad.h Thu Oct 04 14:19:36 2012 @@ -148,6 +148,7 @@ typedef struct _al_mad_svc cl_qlist_t send_list; cl_timer_t send_timer; + ULONG send_jitter; cl_qlist_t recv_list; cl_timer_t recv_timer; diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c .\core\al\kernel\al_cm_cep.c --- \dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c Thu Sep 20 17:51:02 2012 +++ .\core\al\kernel\al_cm_cep.c Tue Oct 16 14:53:19 2012 @@ -36,6 +36,7 @@ #include <complib/cl_spinlock.h> #include <iba/ib_al_ifc.h> #include <iba/ib_cm_ifc.h> +#include <limits.h> #include "al_common.h" #include "al_cm_cep.h" #include "al_cm_conn.h" @@ -3612,7 +3613,7 @@ 
__calc_mad_timeout( * trap exceedingly large values to prevent wrapping. */ if( pkt_life > 39 ) - return ~0UL; + return INT_MAX; if( pkt_life > 14 ) return 67 << (pkt_life - 14); else if( pkt_life > 8 ) diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h .\inc\kernel\complib\cl_types_osd.h --- \dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h Thu Sep 20 17:51:06 2012 +++ .\inc\kernel\complib\cl_types_osd.h Wed Sep 19 14:54:03 2012 @@ -49,7 +49,10 @@ extern "C" #define CL_NTDDK #endif /* NDIS_WDM */ #elif !defined( _MINIPORT_ ) +#ifndef _NTDDK_ +#include <ntifs.h> #include <ntddk.h> +#endif #define CL_NTDDK #endif /* defined( NDIS_MINIPORT_DRIVER ) */ #pragma warning( pop )
ndv2.56.patch
Description: ndv2.56.patch
_______________________________________________ ofw mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
