Hi,

  opal_atomic_lifo implementation suffers from ABA problem.
Here is the code for opal_atomic_lifo_pop:

1     do {
2         item = lifo->opal_lifo_head;
3         if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
4                                     item,
5                                     (void*)item->opal_list_next ) )
6             break;
7         /* Do some kind of pause to release the bus */
8     } while( 1 );
9     if( item == &(lifo->opal_lifo_ghost) ) return NULL;
10    item->opal_list_next = NULL;
11    return item;

If the following happens:

   Thread1:                                           Thread2:
1 executes 2
2                                                   executes 1-11 and acquire 
item
3 enters 3 but preempted before cmpxchg
  NOTE: the third parameter passed to cmpset is
  NULL because item is in use by thread2
4                                                   executes lifo_push(item)
5 successfully executes cmpxchg since the old
  value is equal to current value (ABA problem)
  but places NULL into lifo->opal_lifo_head!

Included patch seems to be fixing this problem, but I am not really sure
if this is the right whay to solve this kind of problems.


diff --git a/opal/class/opal_atomic_lifo.h b/opal/class/opal_atomic_lifo.h
index caf35b1..4e8148c 100644
--- a/opal/class/opal_atomic_lifo.h
+++ b/opal/class/opal_atomic_lifo.h
@@ -71,8 +71,10 @@ static inline opal_list_item_t* opal_atomic_lifo_push( 
opal_atomic_lifo_t* lifo,
         item->opal_list_next = lifo->opal_lifo_head;
         if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
                                     (void*)item->opal_list_next,
-                                    item ) )
+                                    item ) ) {
+            opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0);
             return (opal_list_item_t*)item->opal_list_next;
+        }
         /* DO some kind of pause to release the bus */
     } while( 1 );
 #else
@@ -89,14 +91,17 @@ static inline opal_list_item_t* opal_atomic_lifo_pop( 
opal_atomic_lifo_t* lifo )
 {
     opal_list_item_t* item;
 #if OMPI_HAVE_THREAD_SUPPORT
-    do {
-        item = lifo->opal_lifo_head;
+    while((item = lifo->opal_lifo_head) != &(lifo->opal_lifo_ghost))
+    {
+        if(!opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 0, 1))
+            continue;
         if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
                                     item,
                                     (void*)item->opal_list_next ) )
             break;
+        opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0);
         /* Do some kind of pause to release the bus */
-    } while( 1 );
+    } 
 #else
     item = lifo->opal_lifo_head;
     lifo->opal_lifo_head = (opal_list_item_t*)item->opal_list_next;
diff --git a/opal/class/opal_list.c b/opal/class/opal_list.c
index c8a5568..715715e 100644
--- a/opal/class/opal_list.c
+++ b/opal/class/opal_list.c
@@ -55,6 +55,7 @@ OBJ_CLASS_INSTANCE(
 static void opal_list_item_construct(opal_list_item_t *item)
 {
     item->opal_list_next = item->opal_list_prev = NULL;
+    item->item_free = 1;
 #if OMPI_ENABLE_DEBUG
     item->opal_list_item_refcount = 0;
     item->opal_list_item_belong_to = NULL;
diff --git a/opal/class/opal_list.h b/opal/class/opal_list.h
index 83fa57b..3a45f4e 100644
--- a/opal/class/opal_list.h
+++ b/opal/class/opal_list.h
@@ -102,6 +102,7 @@ struct opal_list_item_t
     /**< Pointer to next list item */
     volatile struct opal_list_item_t *opal_list_prev;
     /**< Pointer to previous list item */
+    int32_t item_free;

 #if OMPI_ENABLE_DEBUG
     /** Atomic reference count for debugging */
--
                        Gleb.

Reply via email to