IBAL: Add exponential backoff to SA queries when the specified timeout is 
negative.  The negated timeout is split into two 16-bit halves: the upper half 
is the maximum delay, which caps the exponential backoff, and the lower half is 
the starting delay.  A random (per host) jitter is added to the timeout so that 
concurrent queries (as you might see with an MPI all-to-all) don't flood the SA.

Signed-off-by: Fab Tillier <[email protected]>

diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r 
\dev\openib\Mellanox\11011\core\al\al_mad.c .\core\al\al_mad.c
--- \dev\openib\Mellanox\11011\core\al\al_mad.c Thu Sep 20 17:51:04 2012
+++ .\core\al\al_mad.c  Tue Oct 09 08:49:38 2012
@@ -32,6 +32,7 @@
 #include <iba/ib_al.h>
 #include <complib/cl_byteswap.h>
 #include <complib/cl_timer.h>
+#include <limits.h>
 
 #include "al.h"
 #include "al_debug.h"
@@ -178,9 +179,10 @@ __cleanup_mad_send(
        IN                              ib_mad_send_handle_t            h_send,
        IN                              uint16_t                                
        ctx);
 
-static __inline void
+static __inline int
 __set_retry_time(
-       IN                              ib_mad_send_handle_t            h_send 
);
+       IN                              ib_mad_send_handle_t            h_send,
+    IN              ULONG                       send_jitter );
 
 static void
 __mad_svc_send_done(
@@ -1071,6 +1073,7 @@ reg_mad_svc(
        ib_mad_svc_handle_t     h_mad_svc;
        al_qp_alias_t           *p_qp_alias;
        ib_qp_attr_t            qp_attr;
+    static ULONG        seed = 0;
 
        AL_ENTER( AL_DBG_MAD_SVC );
        CL_ASSERT( h_qp );
@@ -1108,6 +1111,14 @@ reg_mad_svc(
        cl_qlist_init( &h_mad_svc->send_list );
        cl_qlist_init( &h_mad_svc->recv_list );
 
+    if( seed == 0 )
+    {
+        seed = (ULONG)(ULONG_PTR)p_mad_svc;
+    }
+#ifdef CL_KERNEL
+    h_mad_svc->send_jitter = RtlRandomEx( &seed );
+#endif
+
        p_qp_alias = PARENT_STRUCT( h_qp, al_qp_alias_t, qp );
        h_mad_svc->svc_type = p_mad_svc->svc_type;
        h_mad_svc->obj.context = p_mad_svc->mad_svc_context;
@@ -1967,9 +1976,8 @@ __mad_svc_send_done(
                                ("waiting for response for TID:0x%I64x\n",
                                __get_send_tid( h_send )) );
 
-                       __set_retry_time( h_send );
                        cl_timer_trim( &h_mad_svc->send_timer,
-                               h_send->p_send_mad->timeout_ms );
+                               __set_retry_time( h_send, 
h_mad_svc->send_jitter ) );
                }
                cl_spinlock_release( &h_mad_svc->obj.lock );
        }
@@ -2962,14 +2972,51 @@ __process_rmpp_nack(
 
 
 
-static __inline void
+static __inline int
 __set_retry_time(
-       IN                              ib_mad_send_handle_t            h_send )
+       IN                              ib_mad_send_handle_t            h_send,
+    IN              ULONG                       send_jitter )
 {
-       h_send->retry_time =
-               (uint64_t)(h_send->p_send_mad->timeout_ms + h_send->delay) * 
1000Ui64 +
-               cl_get_time_stamp();
+    int timeout = (int)h_send->p_send_mad->timeout_ms;
+
+    //
+    // Negative values select exponential backoff: -((max << 16) | current).
+    //
+    if( timeout < 0 )
+    {
+        int max;
+        timeout = -timeout;
+        max = timeout >> 16;
+        timeout &= 0xFFFFUL;
+
+        if( max == 0 )
+        {
+            max = SHRT_MAX;
+        }
+
+        if( (timeout * 2) <= max )
+        {
+            //
+            // Double the timeout for the next iteration.
+            //
+            h_send->p_send_mad->timeout_ms = (ULONG)-((max << 16) | (timeout * 
2));
+        }
+        else
+        {
+            h_send->p_send_mad->timeout_ms = (ULONG)-((max << 16) | max);
+        }
+    }
+
+    //
+    // Add some jitter, between 0 and 1/2 timeout, in milliseconds (scaled to
+    // microseconds below).  NOTE(review): timeout == 0 faults in the modulo.
+    //
+    timeout += (send_jitter % timeout) / 2;
+    timeout += h_send->delay;
+
+       h_send->retry_time = (uint64_t)(timeout) * 1000Ui64 + 
cl_get_time_stamp();
        h_send->delay = 0;
+    return timeout;
 }
 
 
@@ -3076,9 +3123,8 @@ __check_send_queue(
                                else
                                {
                                        /* The send was delivered.  Continue 
waiting. */
-                                       __set_retry_time( h_send );
                                        cl_timer_trim( &h_mad_svc->send_timer,
-                                               ((uint32_t)(h_send->retry_time 
- cur_time) / 1000) );
+                                               __set_retry_time( h_send, 
h_mad_svc->send_jitter ) );
                                }
                        }
                        else
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r 
\dev\openib\Mellanox\11011\core\al\al_mad.h .\core\al\al_mad.h
--- \dev\openib\Mellanox\11011\core\al\al_mad.h Thu Sep 20 17:51:04 2012
+++ .\core\al\al_mad.h  Thu Oct 04 14:19:36 2012
@@ -148,6 +148,7 @@ typedef struct _al_mad_svc
 
        cl_qlist_t                                      send_list;
        cl_timer_t                                      send_timer;
+    ULONG                       send_jitter;
 
        cl_qlist_t                                      recv_list;
        cl_timer_t                                      recv_timer;
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r 
\dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c 
.\core\al\kernel\al_cm_cep.c
--- \dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c       Thu Sep 20 
17:51:02 2012
+++ .\core\al\kernel\al_cm_cep.c        Tue Oct 16 14:53:19 2012
@@ -36,6 +36,7 @@
 #include <complib/cl_spinlock.h>
 #include <iba/ib_al_ifc.h>
 #include <iba/ib_cm_ifc.h>
+#include <limits.h>
 #include "al_common.h"
 #include "al_cm_cep.h"
 #include "al_cm_conn.h"
@@ -3612,7 +3613,7 @@ __calc_mad_timeout(
         * trap exceedingly large values to prevent wrapping.
         */
        if( pkt_life > 39 )
-               return ~0UL;
+               return INT_MAX;
        if( pkt_life > 14 )
                return 67 << (pkt_life - 14);
        else if( pkt_life > 8 )
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r 
\dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h 
.\inc\kernel\complib\cl_types_osd.h
--- \dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h        Thu Sep 
20 17:51:06 2012
+++ .\inc\kernel\complib\cl_types_osd.h Wed Sep 19 14:54:03 2012
@@ -49,7 +49,10 @@ extern "C"
 #define CL_NTDDK
 #endif /* NDIS_WDM */
 #elif !defined( _MINIPORT_ )
+#ifndef _NTDDK_
+#include <ntifs.h>
 #include <ntddk.h>
+#endif
 #define CL_NTDDK
 #endif /* defined( NDIS_MINIPORT_DRIVER ) */
 #pragma warning( pop )

Attachment: ndv2.56.patch
Description: ndv2.56.patch

_______________________________________________
ofw mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw

Reply via email to