This patch cures the remaining libgomp failures I see on power7. I'll admit to approaching this fix with a big hammer at first, liberally using MEMMODEL_SEQ_CST barriers all over bar.h and bar.c, then gradually reducing the number of places and strictness of the barriers. This may still have a few unnecessary barriers, but I'm not confident in removing any more.
What led me to believe that barriers were missing in the current code was the nature of the test failures in the first place, and that the existing code uses atomic_write_barrier in two places between writing "awaited" and "generation", but does not have corresponding read barriers. Without read barriers processors are allowed to speculatively read "generation" ahead of time, leading to use of a stale value. PR libgomp/51298 * config/linux/bar.h: Use atomic rather than sync builtins. * config/linux/bar.c: Likewise. Add missing acquire and release synchronisation on generation field. * task.c (gomp_barrier_handle_tasks): Regain lock so as to not double unlock. Index: libgomp/task.c =================================================================== --- libgomp/task.c (revision 181718) +++ libgomp/task.c (working copy) @@ -273,6 +273,7 @@ gomp_barrier_handle_tasks (gomp_barrier_ gomp_team_barrier_done (&team->barrier, state); gomp_mutex_unlock (&team->task_lock); gomp_team_barrier_wake (&team->barrier, 0); + gomp_mutex_lock (&team->task_lock); } } } Index: libgomp/config/linux/bar.h =================================================================== --- libgomp/config/linux/bar.h (revision 181718) +++ libgomp/config/linux/bar.h (working copy) @@ -50,7 +50,7 @@ static inline void gomp_barrier_init (go static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count) { - __sync_fetch_and_add (&bar->awaited, count - bar->total); + __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL); bar->total = count; } @@ -69,10 +69,8 @@ extern void gomp_team_barrier_wake (gomp static inline gomp_barrier_state_t gomp_barrier_wait_start (gomp_barrier_t *bar) { - unsigned int ret = bar->generation & ~3; - /* Do we need any barrier here or is __sync_add_and_fetch acting - as the needed LoadLoad barrier already? */ - ret += __sync_add_and_fetch (&bar->awaited, -1) == 0; + unsigned int ret = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) & ~3; + ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0; return ret; } Index: libgomp/config/linux/bar.c =================================================================== --- libgomp/config/linux/bar.c (revision 181718) +++ libgomp/config/linux/bar.c (working copy) @@ -36,18 +36,15 @@ gomp_barrier_wait_end (gomp_barrier_t *b if (__builtin_expect ((state & 1) != 0, 0)) { /* Next time we'll be awaiting TOTAL threads again. */ - bar->awaited = bar->total; - atomic_write_barrier (); - bar->generation += 4; + __atomic_store_4 (&bar->awaited, bar->total, MEMMODEL_RELEASE); + __atomic_add_fetch (&bar->generation, 4, MEMMODEL_ACQ_REL); futex_wake ((int *) &bar->generation, INT_MAX); } else { - unsigned int generation = state; - do - do_wait ((int *) &bar->generation, generation); - while (bar->generation == generation); + do_wait ((int *) &bar->generation, state); + while (__atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) == state); } } @@ -81,15 +78,15 @@ gomp_team_barrier_wake (gomp_barrier_t * void gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) { - unsigned int generation; + unsigned int generation, gen; if (__builtin_expect ((state & 1) != 0, 0)) { /* Next time we'll be awaiting TOTAL threads again. */ struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; - bar->awaited = bar->total; - atomic_write_barrier (); + + __atomic_store_4 (&bar->awaited, bar->total, MEMMODEL_RELEASE); if (__builtin_expect (team->task_count, 0)) { gomp_barrier_handle_tasks (state); @@ -97,7 +94,7 @@ gomp_team_barrier_wait_end (gomp_barrier } else { - bar->generation = state + 3; + __atomic_store_4 (&bar->generation, state + 3, MEMMODEL_RELEASE); futex_wake ((int *) &bar->generation, INT_MAX); return; } @@ -107,12 +104,14 @@ gomp_team_barrier_wait_end (gomp_barrier do { do_wait ((int *) &bar->generation, generation); - if (__builtin_expect (bar->generation & 1, 0)) + gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE); + if (__builtin_expect (gen & 1, 0)) gomp_barrier_handle_tasks (state); - if ((bar->generation & 2)) + gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE); + if ((gen & 2) != 0) generation |= 2; } - while (bar->generation != state + 4); + while (gen != state + 4); } void -- Alan Modra Australia Development Lab, IBM