While debugging a deadlock in the opensm executable, we have found that there 
is a problem in the cl_timer code:

The problem starts from the fact that there can be two call backs that are 
running simultaneously.
On each call back that runs there is the following code:

static void CALLBACK
__timer_callback(
                IN cl_timer_t* const p_timer,
                IN BOOLEAN timer_signalled )
{
                /* timer_signalled is always TRUE, and has no value. */
                CL_ASSERT( timer_signalled );
                UNUSED_PARAM( timer_signalled );

                p_timer->timeout_time = 0;
                p_timer->thread_id = GetCurrentThreadId();

                (p_timer->pfn_callback)( (void*)p_timer->context );

                p_timer->thread_id = 0;
}

This should promise that if a callback is running, p_timer->thread_id  will  
have the thread_id of the running thread.
This field is later used when cl_timer_stop() is being called in order to 
prevent deadlocks. (please note that cl_timer_stop() is being called from 
cl_timer_start(). Please also note that the deadlock happens if the caller of 
cl_stop() is actually executing in the context of the call back itself).

The problem happens when two callbacks will be running and than the 
p_timer->thread_id doesn't show the correct field.

The fix itself is to sync the start and stop calls and now only one callback 
will be running (actually, there is one exception to this code: if from the 
callback, a new timer is started, the new callback might be running on the same 
time, but: (1) stop has already being called. (2) the p_timer->thread_id is not 
being touched after the callback.

Thanks
Tzachi




Index: 
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/core/complib/user/cl_timer.c
===================================================================
--- 
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/core/complib/user/cl_timer.c 
(revision 5977)
+++ 
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/core/complib/user/cl_timer.c 
             (revision 5978)
@@ -33,25 +33,27 @@

 #include "complib/cl_timer.h"

-
 static void CALLBACK
 __timer_callback(
                IN cl_timer_t* const p_timer,
                IN BOOLEAN timer_signalled )
 {
                /* timer_signalled is always TRUE, and has no value. */
+
+             DWORD thread_id = GetCurrentThreadId();
                CL_ASSERT( timer_signalled );
+
                UNUSED_PARAM( timer_signalled );

+             CL_ASSERT(thread_id!=0);
+
                p_timer->timeout_time = 0;
-              p_timer->thread_id = GetCurrentThreadId();
+             p_timer->thread_id = thread_id;

                (p_timer->pfn_callback)( (void*)p_timer->context );

-              p_timer->thread_id = 0;
 }

-
 void
 cl_timer_construct(
                IN           cl_timer_t* const            p_timer )
@@ -76,7 +78,8 @@

                p_timer->pfn_callback = pfn_callback;
                p_timer->context = context;
-              return( CL_SUCCESS );
+             return cl_spinlock_init(&p_timer->spinlock);
+
 }


@@ -97,6 +100,8 @@
 {
                CL_ASSERT( p_timer );

+             cl_spinlock_acquire(&p_timer->spinlock);
+
                cl_timer_stop( p_timer );

                p_timer->timeout_time = cl_get_time_stamp() + 
(((uint64_t)time_ms) * 1000);
@@ -104,9 +109,12 @@
                if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL, 
__timer_callback,
                                p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
                {
+
+                             cl_spinlock_release(&p_timer->spinlock);
                                return( CL_ERROR );
                }

+             cl_spinlock_release(&p_timer->spinlock);
                return( CL_SUCCESS );
 }

@@ -117,26 +125,34 @@
                IN           const uint32_t                   time_ms )
 {
                uint64_t                               timeout_time;
+             cl_status_t                          status;

                CL_ASSERT( p_timer );
                CL_ASSERT( p_timer->pfn_callback );

+             cl_spinlock_acquire(&p_timer->spinlock);
+
                /* Calculate the timeout time in the timer object. */
                timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) * 
1000);

                /* Only pull in the timeout time. */
-              if( p_timer->timeout_time && p_timer->timeout_time < 
timeout_time )
+             if( p_timer->timeout_time && p_timer->timeout_time < timeout_time 
) {
+                             cl_spinlock_release(&p_timer->spinlock);
                                return( CL_SUCCESS );
+             }

-              return cl_timer_start( p_timer, time_ms );
+             status = cl_timer_start( p_timer, time_ms );
+
+             cl_spinlock_release(&p_timer->spinlock);
+             return status;
 }

-
 void
 cl_timer_stop(
                IN           cl_timer_t* const            p_timer )
 {
                CL_ASSERT( p_timer );
+             cl_spinlock_acquire(&p_timer->spinlock);

                if( p_timer->h_timer && p_timer->thread_id != 
GetCurrentThreadId() )
                {
@@ -145,6 +161,8 @@
                                p_timer->h_timer = NULL;
                }
                p_timer->timeout_time = 0;
+
+             cl_spinlock_release(&p_timer->spinlock);
 }


Index: B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/complib/cl_timer.h
===================================================================
--- B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/complib/cl_timer.h   
            (revision 5977)
+++ B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/complib/cl_timer.h   
         (revision 5978)
@@ -45,8 +45,8 @@


 #include <complib/cl_types.h>
+#include <complib/cl_spinlock.h>

-
 /****h* Component Library/Timer
 * NAME
 *            Timer
Index: 
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/user/complib/cl_timer_osd.h
===================================================================
--- 
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/user/complib/cl_timer_osd.h
          (revision 5977)
+++ 
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/user/complib/cl_timer_osd.h
       (revision 5978)
@@ -47,8 +47,9 @@
                const void                                                      
     *context;
                uint64_t                                                        
       timeout_time;
                DWORD                                                           
                     thread_id;
+             cl_spinlock_t                                      spinlock;

 } cl_timer_t;


-#endif // _CL_TIMER_OSD_H_
\ No newline at end of file
+#endif                // _CL_TIMER_OSD_H_

Attachment: timer.patch
Description: timer.patch

_______________________________________________
ofw mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw

Reply via email to