Smith, Stan wrote: > Eric Lantz (HPC) wrote: >> We've had so much trouble with this particular opensm issue...and >> have a 36-node cluster that repro's the problem readily on which we'd >> like to test the fix. We also have a 256-node cluster with >> Infinihost interfaces on which I'd like to test. I've added James >> Ren to this thread as he's the perf tester here on our team who'll >> do the actual work. >> >> Is there an MSI available (private drop would be fine) which >> includes: >> 1. opensm timer race fix >> 2. all the memory leak fixes >> 3. RDMA-Read fix in NdEndpoint.cpp >> Such that the driver/sm are the best we currently know how to make? > > Funny you should ask, I was just in the process of making a snapshot > release; Look for it after 5:00PM PDT. I'm not sure about the > RDMA-Read fix in NdEndpoint.cpp; if it's HEAD of svn then you will > get it. > Can someone else speak to this? > > stan.
Including #3 (thanks Fab) please find svn.2843 based installers @ http://www.openfabrics.org/downloads/Windows/SnapShots/v2843/ enjoy, stan. > >> >> Thanks, >> Eric >> >> >> -----Original Message----- >> From: [email protected] >> [mailto:[email protected]] On Behalf Of Tzachi Dar >> Sent: Tuesday, June 29, 2010 12:49 AM >> To: Hefty, Sean; [email protected] >> Subject: Re: [ofw] [PATCH v2] complib/user: fix timer race conditions >> >> OK, a very nice trick to use a periodic timer. >> >> I also believe that stop can be called from a callback since the >> criticalsections are re-entrant. >> Indeed in that case you will not know if the callback is running >> until one will call cl_timer_destroy. >> >> Thanks >> Tzachi >> >>> -----Original Message----- >>> From: [email protected] [mailto:ofw- >>> [email protected]] On Behalf Of Hefty, Sean >>> Sent: Tuesday, June 29, 2010 12:51 AM >>> To: [email protected] >>> Subject: [ofw] [PATCH v2] complib/user: fix timer race conditions >>> >>> This serializes the callbacks for a timer and provides >>> synchronization for multi-threaded timer usage. >>> >>> cl_timer_stop is not allowed to be called from a timer callback, and >>> will block until any callback has completed. >>> >>> cl_timer_start simply calls cl_timer_trim >>> >>> cl_timer_trim changes the timeout of a queued timer if the new >>> timeout time is less than the last set timeout. >>> >>> Signed-off-by: Sean Hefty <[email protected]> >>> --- >>> Changes from v1: >>> Eliminated TimerQueue. The default one is used. >>> Replaced the one-shot timer with a periodic timer. This lets us use >>> the ChangeTimerQueueTimer (who names these things, really) function >>> to adjust the next timeout. Moved the calculation of the timeout >>> time to inside the critical section to improve its accuracy in case >>> an attempt to acquire the critical section blocks. >>> >>> trunk/core/complib/user/cl_timer.c | 83 >>> ++++++++++++++++++------- -------- >>> trunk/inc/user/complib/cl_timer_osd.h | 5 +- 2 files changed, 47 >>> insertions(+), 41 deletions(-) >>> >>> diff --git a/trunk/core/complib/user/cl_timer.c >>> b/trunk/core/complib/user/cl_timer.c >>> index 4f2ce0f..5cab378 100644 >>> --- a/trunk/core/complib/user/cl_timer.c >>> +++ b/trunk/core/complib/user/cl_timer.c >>> @@ -32,23 +32,31 @@ >>> >>> >>> #include "complib/cl_timer.h" >>> + >>> >>> +#define CL_MAX_TIME 0xFFFFFFFF >>> >>> + >>> +/* >>> + * If timeout_time is 0, the timer has been stopped. + */ static >>> void CALLBACK __timer_callback( >>> IN cl_timer_t* const p_timer, >>> IN BOOLEAN timer_signalled ) >>> { >>> - /* timer_signalled is always TRUE, and has no value. */ >>> - CL_ASSERT( timer_signalled ); >>> + uint64_t timeout; >>> UNUSED_PARAM( timer_signalled ); >>> >>> + EnterCriticalSection( &p_timer->cb_lock ); >>> + EnterCriticalSection( &p_timer->lock ); >>> + timeout = p_timer->timeout_time; >>> p_timer->timeout_time = 0; >>> - p_timer->thread_id = GetCurrentThreadId(); >>> + LeaveCriticalSection( &p_timer->lock ); >>> >>> - (p_timer->pfn_callback)( (void*)p_timer->context ); - >>> - p_timer->thread_id = 0; >>> + if( timeout ) >>> + (p_timer->pfn_callback)( (void*)p_timer->context ); >>> + LeaveCriticalSection( &p_timer->cb_lock ); >>> } >>> >>> >>> @@ -56,9 +64,7 @@ void >>> cl_timer_construct( >>> IN cl_timer_t* const p_timer ) >>> { >>> - p_timer->h_timer = NULL; >>> - p_timer->timeout_time = 0; >>> - p_timer->thread_id = 0; >>> + memset(p_timer, 0, sizeof *p_timer); >>> } >>> >>> >>> @@ -76,6 +82,11 @@ cl_timer_init( >>> >>> p_timer->pfn_callback = pfn_callback; >>> p_timer->context = context; >>> + InitializeCriticalSection( &p_timer->lock ); >>> + InitializeCriticalSection( &p_timer->cb_lock ); >>> + if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL, >>> __timer_callback, + p_timer, CL_MAX_TIME, CL_MAX_TIME, >>> WT_EXECUTEINIOTHREAD ) ) + return CL_ERROR; return( >>> CL_SUCCESS >>> ); } >>> >>> @@ -86,7 +97,9 @@ cl_timer_destroy( >>> { >>> CL_ASSERT( p_timer ); >>> >>> - cl_timer_stop( p_timer ); >>> + DeleteTimerQueueTimer( NULL, p_timer->h_timer, >>> INVALID_HANDLE_VALUE ); >>> + DeleteCriticalSection( &p_timer->lock ); >>> + DeleteCriticalSection( &p_timer->cb_lock ); >>> } >>> >>> >>> @@ -95,19 +108,7 @@ cl_timer_start( >>> IN cl_timer_t* const p_timer, >>> IN const uint32_t time_ms ) >>> { >>> - CL_ASSERT( p_timer ); >>> - >>> - cl_timer_stop( p_timer ); >>> - >>> - p_timer->timeout_time = cl_get_time_stamp() + >>> (((uint64_t)time_ms) * 1000); >>> - >>> - if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL, >>> __timer_callback, >>> - p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) ) >>> - { >>> - return( CL_ERROR ); >>> - } >>> - >>> - return( CL_SUCCESS ); >>> + return cl_timer_trim( p_timer, time_ms ); >>> } >>> >>> >>> @@ -116,35 +117,41 @@ cl_timer_trim( >>> IN cl_timer_t* const p_timer, >>> IN const uint32_t time_ms ) >>> { >>> - uint64_t timeout_time; >>> + uint64_t timeout; >>> + cl_status_t status = CL_SUCCESS; >>> >>> CL_ASSERT( p_timer ); >>> CL_ASSERT( p_timer->pfn_callback ); >>> >>> - /* Calculate the timeout time in the timer object. */ >>> - timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) * >>> 1000); - >>> - /* Only pull in the timeout time. */ >>> - if( p_timer->timeout_time && p_timer->timeout_time < timeout_time >>> ) >>> - return( CL_SUCCESS ); >>> - >>> - return cl_timer_start( p_timer, time_ms ); >>> + EnterCriticalSection( &p_timer->lock ); >>> + timeout = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000); >>> + if ( !p_timer->timeout_time || timeout < p_timer->timeout_time ) >>> + { + if( ChangeTimerQueueTimer( NULL, p_timer->h_timer, >>> time_ms, >>> CL_MAX_TIME ) ) + p_timer->timeout_time = timeout; + >>> else >>> + status = CL_ERROR; >>> + } >>> + LeaveCriticalSection( &p_timer->lock ); >>> + return status; >>> } >>> >>> >>> +/* >>> + * Acquire cb_lock to ensure that all callbacks have completed. + >>> */ void cl_timer_stop( >>> IN cl_timer_t* const p_timer ) >>> { >>> CL_ASSERT( p_timer ); >>> >>> - if( p_timer->h_timer && p_timer->thread_id != >>> GetCurrentThreadId() ) >>> - { >>> - /* Make sure we block until the timer is cancelled. */ >>> - DeleteTimerQueueTimer( NULL, p_timer->h_timer, >>> INVALID_HANDLE_VALUE ); >>> - p_timer->h_timer = NULL; >>> - } >>> + EnterCriticalSection( &p_timer->cb_lock ); >>> + EnterCriticalSection( &p_timer->lock ); >>> p_timer->timeout_time = 0; >>> + ChangeTimerQueueTimer( NULL, p_timer->h_timer, CL_MAX_TIME, >>> CL_MAX_TIME ); + LeaveCriticalSection( &p_timer->lock ); >>> + LeaveCriticalSection( &p_timer->cb_lock ); >>> } >>> >>> >>> diff --git a/trunk/inc/user/complib/cl_timer_osd.h >>> b/trunk/inc/user/complib/cl_timer_osd.h >>> index 0a023ba..a4a1195 100644 >>> --- a/trunk/inc/user/complib/cl_timer_osd.h >>> +++ b/trunk/inc/user/complib/cl_timer_osd.h >>> @@ -38,15 +38,14 @@ >>> >>> #include "cl_types.h" >>> >>> - >>> -/* Timer object definition. */ >>> typedef struct _cl_timer >>> { >>> HANDLE h_timer; >>> cl_pfn_timer_callback_t pfn_callback; >>> const void *context; >>> uint64_t timeout_time; >>> - DWORD thread_id; >>> + CRITICAL_SECTION lock; >>> + CRITICAL_SECTION cb_lock; >>> >>> } cl_timer_t; >>> >> >> _______________________________________________ >> ofw mailing list >> [email protected] >> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw >> >> _______________________________________________ >> ofw mailing list >> [email protected] >> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw _______________________________________________ ofw mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
