Eric Lantz (HPC) wrote: > We've had so much trouble with this particular opensm issue...and > have a 36-node cluster that repro's the problem readily on which we'd > like to test the fix. We also have a 256-node cluster with > Infinihost interfaces on which I'd like to test. I've added James > Ren to this thread as he's the perf tester here on our team who'll do > the actual work. > > Is there an MSI available (private drop would be fine) which includes: > 1. opensm timer race fix > 2. all the memory leak fixes > 3. RDMA-Read fix in NdEndpoint.cpp > Such that the driver/sm are the best we currently know how to make?
Funny you should ask, I was just in the process of making a snapshot release; Look for it after 5:00PM PDT. I'm not sure about the RDMA-Read fix in NdEndpoint.cpp; if it's HEAD of svn then you will get it. Can someone else speak to this? stan. > > Thanks, > Eric > > > -----Original Message----- > From: [email protected] > [mailto:[email protected]] On Behalf Of Tzachi Dar > Sent: Tuesday, June 29, 2010 12:49 AM > To: Hefty, Sean; [email protected] > Subject: Re: [ofw] [PATCH v2] complib/user: fix timer race conditions > > OK, a very nice trick to use a periodic timer. > > I also believe that stop can be called from a callback since the > criticalsections are re-entrant. > Indeed in that case you will not know if the callback is running > until one will call cl_timer_destroy. > > Thanks > Tzachi > >> -----Original Message----- >> From: [email protected] [mailto:ofw- >> [email protected]] On Behalf Of Hefty, Sean >> Sent: Tuesday, June 29, 2010 12:51 AM >> To: [email protected] >> Subject: [ofw] [PATCH v2] complib/user: fix timer race conditions >> >> This serializes the callbacks for a timer and provides >> synchronization for multi-threaded timer usage. >> >> cl_timer_stop is not allowed to be called from a timer callback, and >> will block until any callback has completed. >> >> cl_timer_start simply calls cl_timer_trim >> >> cl_timer_trim changes the timeout of a queued timer if the new >> timeout time is less than the last set timeout. >> >> Signed-off-by: Sean Hefty <[email protected]> >> --- >> Changes from v1: >> Eliminated TimerQueue. The default one is used. >> Replaced the one-shot timer with a periodic timer. This lets us use >> the ChangeTimerQueueTimer (who names these things, really) function >> to adjust the next timeout. Moved the calculation of the timeout >> time to inside the critical section to improve its accuracy in case >> an attempt to acquire the critical section blocks. >> >> trunk/core/complib/user/cl_timer.c | 83 >> ++++++++++++++++++------- -------- >> trunk/inc/user/complib/cl_timer_osd.h | 5 +- 2 files changed, 47 >> insertions(+), 41 deletions(-) >> >> diff --git a/trunk/core/complib/user/cl_timer.c >> b/trunk/core/complib/user/cl_timer.c >> index 4f2ce0f..5cab378 100644 >> --- a/trunk/core/complib/user/cl_timer.c >> +++ b/trunk/core/complib/user/cl_timer.c >> @@ -32,23 +32,31 @@ >> >> >> #include "complib/cl_timer.h" >> + >> >> +#define CL_MAX_TIME 0xFFFFFFFF >> >> + >> +/* >> + * If timeout_time is 0, the timer has been stopped. + */ >> static void CALLBACK >> __timer_callback( >> IN cl_timer_t* const p_timer, >> IN BOOLEAN timer_signalled ) >> { >> - /* timer_signalled is always TRUE, and has no value. */ >> - CL_ASSERT( timer_signalled ); >> + uint64_t timeout; >> UNUSED_PARAM( timer_signalled ); >> >> + EnterCriticalSection( &p_timer->cb_lock ); >> + EnterCriticalSection( &p_timer->lock ); >> + timeout = p_timer->timeout_time; >> p_timer->timeout_time = 0; >> - p_timer->thread_id = GetCurrentThreadId(); >> + LeaveCriticalSection( &p_timer->lock ); >> >> - (p_timer->pfn_callback)( (void*)p_timer->context ); - >> - p_timer->thread_id = 0; >> + if( timeout ) >> + (p_timer->pfn_callback)( (void*)p_timer->context ); >> + LeaveCriticalSection( &p_timer->cb_lock ); >> } >> >> >> @@ -56,9 +64,7 @@ void >> cl_timer_construct( >> IN cl_timer_t* const p_timer ) >> { >> - p_timer->h_timer = NULL; >> - p_timer->timeout_time = 0; >> - p_timer->thread_id = 0; >> + memset(p_timer, 0, sizeof *p_timer); >> } >> >> >> @@ -76,6 +82,11 @@ cl_timer_init( >> >> p_timer->pfn_callback = pfn_callback; >> p_timer->context = context; >> + InitializeCriticalSection( &p_timer->lock ); >> + InitializeCriticalSection( &p_timer->cb_lock ); >> + if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL, >> __timer_callback, + p_timer, CL_MAX_TIME, CL_MAX_TIME, >> WT_EXECUTEINIOTHREAD ) ) + return CL_ERROR; return( >> CL_SUCCESS ); >> } >> >> @@ -86,7 +97,9 @@ cl_timer_destroy( >> { >> CL_ASSERT( p_timer ); >> >> - cl_timer_stop( p_timer ); >> + DeleteTimerQueueTimer( NULL, p_timer->h_timer, >> INVALID_HANDLE_VALUE ); >> + DeleteCriticalSection( &p_timer->lock ); >> + DeleteCriticalSection( &p_timer->cb_lock ); >> } >> >> >> @@ -95,19 +108,7 @@ cl_timer_start( >> IN cl_timer_t* const p_timer, >> IN const uint32_t time_ms ) >> { >> - CL_ASSERT( p_timer ); >> - >> - cl_timer_stop( p_timer ); >> - >> - p_timer->timeout_time = cl_get_time_stamp() + >> (((uint64_t)time_ms) * 1000); >> - >> - if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL, >> __timer_callback, >> - p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) ) >> - { >> - return( CL_ERROR ); >> - } >> - >> - return( CL_SUCCESS ); >> + return cl_timer_trim( p_timer, time_ms ); >> } >> >> >> @@ -116,35 +117,41 @@ cl_timer_trim( >> IN cl_timer_t* const p_timer, >> IN const uint32_t time_ms ) >> { >> - uint64_t timeout_time; >> + uint64_t timeout; >> + cl_status_t status = CL_SUCCESS; >> >> CL_ASSERT( p_timer ); >> CL_ASSERT( p_timer->pfn_callback ); >> >> - /* Calculate the timeout time in the timer object. */ >> - timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000); >> - >> - /* Only pull in the timeout time. */ >> - if( p_timer->timeout_time && p_timer->timeout_time < timeout_time ) >> - return( CL_SUCCESS ); >> - >> - return cl_timer_start( p_timer, time_ms ); >> + EnterCriticalSection( &p_timer->lock ); >> + timeout = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000); >> + if ( !p_timer->timeout_time || timeout < p_timer->timeout_time ) >> + { + if( ChangeTimerQueueTimer( NULL, p_timer->h_timer, >> time_ms, >> CL_MAX_TIME ) ) + p_timer->timeout_time = timeout; >> + else >> + status = CL_ERROR; >> + } >> + LeaveCriticalSection( &p_timer->lock ); >> + return status; >> } >> >> >> +/* >> + * Acquire cb_lock to ensure that all callbacks have completed. + */ >> void >> cl_timer_stop( >> IN cl_timer_t* const p_timer ) >> { >> CL_ASSERT( p_timer ); >> >> - if( p_timer->h_timer && p_timer->thread_id != >> GetCurrentThreadId() ) >> - { >> - /* Make sure we block until the timer is cancelled. */ >> - DeleteTimerQueueTimer( NULL, p_timer->h_timer, >> INVALID_HANDLE_VALUE ); >> - p_timer->h_timer = NULL; >> - } >> + EnterCriticalSection( &p_timer->cb_lock ); >> + EnterCriticalSection( &p_timer->lock ); >> p_timer->timeout_time = 0; >> + ChangeTimerQueueTimer( NULL, p_timer->h_timer, CL_MAX_TIME, >> CL_MAX_TIME ); + LeaveCriticalSection( &p_timer->lock ); >> + LeaveCriticalSection( &p_timer->cb_lock ); >> } >> >> >> diff --git a/trunk/inc/user/complib/cl_timer_osd.h >> b/trunk/inc/user/complib/cl_timer_osd.h >> index 0a023ba..a4a1195 100644 >> --- a/trunk/inc/user/complib/cl_timer_osd.h >> +++ b/trunk/inc/user/complib/cl_timer_osd.h >> @@ -38,15 +38,14 @@ >> >> #include "cl_types.h" >> >> - >> -/* Timer object definition. */ >> typedef struct _cl_timer >> { >> HANDLE h_timer; >> cl_pfn_timer_callback_t pfn_callback; >> const void *context; >> uint64_t timeout_time; >> - DWORD thread_id; >> + CRITICAL_SECTION lock; >> + CRITICAL_SECTION cb_lock; >> >> } cl_timer_t; >> > > _______________________________________________ > ofw mailing list > [email protected] > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw > > _______________________________________________ > ofw mailing list > [email protected] > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw _______________________________________________ ofw mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
