Hi! Guided scheduling doesn't ensure that all assigned chunks have sizes of multiple of chunk_size, chunk_size for guided scheduling actually means just minimum size of the chunk. That means we can't divide the IV or number of iterations by chunk size though, because the precondition for that is that all chunks assigned to each thread are multiples of the chunk size.
Fixed thusly, tested on x86_64-linux, committed to gomp-4_5-branch. 2015-11-13 Jakub Jelinek <ja...@redhat.com> * ordered.c (gomp_doacross_init, GOMP_doacross_post, GOMP_doacross_wait, gomp_doacross_ull_init, GOMP_doacross_ull_post, GOMP_doacross_ull_wait): For GFS_GUIDED don't divide number of iterators or IV by chunk size. * testsuite/libgomp.c/doacross-3.c: New test. --- libgomp/ordered.c.jj 2015-10-14 10:24:10.000000000 +0200 +++ libgomp/ordered.c 2015-11-13 19:14:11.877005602 +0100 @@ -297,6 +297,8 @@ gomp_doacross_init (unsigned ncounts, lo if (ws->sched == GFS_STATIC) num_ents = team->nthreads; + else if (ws->sched == GFS_GUIDED) + num_ents = counts[0]; else num_ents = (counts[0] - 1) / chunk_size + 1; if (num_bits <= MAX_COLLAPSED_BITS) @@ -366,6 +368,8 @@ GOMP_doacross_post (long *counts) if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ent = thr->ts.team_id; + else if (ws->sched == GFS_GUIDED) + ent = counts[0]; else ent = counts[0] / doacross->chunk_size; unsigned long *array = (unsigned long *) (doacross->array @@ -426,6 +430,8 @@ GOMP_doacross_wait (long first, ...) else ent = first / ws->chunk_size % thr->ts.team->nthreads; } + else if (ws->sched == GFS_GUIDED) + ent = first; else ent = first / doacross->chunk_size; unsigned long *array = (unsigned long *) (doacross->array @@ -520,6 +526,8 @@ gomp_doacross_ull_init (unsigned ncounts if (ws->sched == GFS_STATIC) num_ents = team->nthreads; + else if (ws->sched == GFS_GUIDED) + num_ents = counts[0]; else num_ents = (counts[0] - 1) / chunk_size + 1; if (num_bits <= MAX_COLLAPSED_BITS) @@ -595,6 +603,8 @@ GOMP_doacross_ull_post (gomp_ull *counts if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ent = thr->ts.team_id; + else if (ws->sched == GFS_GUIDED) + ent = counts[0]; else ent = counts[0] / doacross->chunk_size_ull; @@ -676,6 +686,8 @@ GOMP_doacross_ull_wait (gomp_ull first, else ent = first / ws->chunk_size_ull % thr->ts.team->nthreads; } + else if (ws->sched == GFS_GUIDED) + ent = first; else ent = first / doacross->chunk_size_ull; --- libgomp/testsuite/libgomp.c/doacross-3.c.jj 2015-11-13 19:08:22.191960410 +0100 +++ libgomp/testsuite/libgomp.c/doacross-3.c 2015-11-13 19:09:01.626401650 +0100 @@ -0,0 +1,225 @@ +extern void abort (void); + +#define N 256 +int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6]; +volatile int d, e; +volatile unsigned long long f; + +int +main () +{ + unsigned long long i; + int j, k, l, m; + #pragma omp parallel private (l) + { + #pragma omp for schedule(guided, 3) ordered (1) nowait + for (i = 1; i < N + f; i++) + { + #pragma omp atomic write + a[i] = 1; + #pragma omp ordered depend(sink: i - 1) + if (i > 1) + { + #pragma omp atomic read + l = a[i - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + a[i] = 2; + if (i < N - 1) + { + #pragma omp atomic read + l = a[i + 1]; + if (l == 3) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + a[i] = 3; + } + #pragma omp for schedule(guided) ordered (3) nowait + for (i = 3; i < N / 16 - 1 + f; i++) + for (j = 0; j < 8; j += 2) + for (k = 1; k <= 3; k++) + { + #pragma omp atomic write + b[i][j][k] = 1; + #pragma omp ordered depend(sink: i, j - 2, k - 1) \ + depend(sink: i - 2, j - 2, k + 1) + #pragma omp ordered depend(sink: i - 3, j + 2, k - 2) + if (j >= 2 && k > 1) + { + #pragma omp atomic read + l = b[i][j - 2][k - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + b[i][j][k] = 2; + if (i >= 5 && j >= 2 && k < 3) + { + #pragma omp atomic read + l = b[i - 2][j - 2][k + 1]; + if (l < 2) + abort (); + } + if (i >= 6 && j < N / 16 - 3 && k == 3) + { + #pragma omp atomic read + l = b[i - 3][j + 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + b[i][j][k] = 3; + } +#define A(n) int n; +#define B(n) A(n##0) A(n##1) A(n##2) A(n##3) +#define C(n) B(n##0) B(n##1) B(n##2) B(n##3) +#define D(n) C(n##0) C(n##1) C(n##2) C(n##3) + D(m) +#undef A + #pragma omp for collapse (2) ordered(61) schedule(guided, 15) + for (i = 2; i < N / 32 + f; i++) + for (j = 7; j > 1; j--) + for (k = 6; k >= 0; k -= 2) +#define A(n) for (n = 4; n < 5; n++) + D(m) +#undef A + { + #pragma omp atomic write + c[i][j][k] = 1; +#define A(n) ,n +#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321) + #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \ + depend (sink:i - 2, j + 1, k - 4 E(m)) \ + depend(sink: i - 1, j - 2, k - 2 E(m)) + if (k <= 4) + { + l = c[i][j][k + 2]; + if (l < 2) + abort (); + } + #pragma omp atomic write + c[i][j][k] = 2; + if (i >= 4 && j < 7 && k >= 4) + { + l = c[i - 2][j + 1][k - 4]; + if (l < 2) + abort (); + } + if (i >= 3 && j >= 4 && k >= 2) + { + l = c[i - 1][j - 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend (source) + #pragma omp atomic write + c[i][j][k] = 3; + } + #pragma omp for schedule(guided, 5) ordered (3) nowait + for (j = 0; j < N / 16 - 1; j++) + for (k = 0; k < 8; k += 2) + for (i = 3; i <= 5 + f; i++) + { + #pragma omp atomic write + g[j][k][i] = 1; + #pragma omp ordered depend(sink: j, k - 2, i - 1) \ + depend(sink: j - 2, k - 2, i + 1) + #pragma omp ordered depend(sink: j - 3, k + 2, i - 2) + if (k >= 2 && i > 3) + { + #pragma omp atomic read + l = g[j][k - 2][i - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + g[j][k][i] = 2; + if (j >= 2 && k >= 2 && i < 5) + { + #pragma omp atomic read + l = g[j - 2][k - 2][i + 1]; + if (l < 2) + abort (); + } + if (j >= 3 && k < N / 16 - 3 && i == 5) + { + #pragma omp atomic read + l = g[j - 3][k + 2][i - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + g[j][k][i] = 3; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d; k++) + for (l = 0; l < d + 2; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp single + { + if (i != 3 || j != -1 || k != 0) + abort (); + i = 8; j = 9; k = 10; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d + 2; k++) + for (m = 0; m < d; m++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m) + abort (); + } + #pragma omp single + if (i != 3 || j != -1 || k != 2 || m != 0) + abort (); + #pragma omp for collapse(2) ordered(4) nowait + for (i = 2; i < f + 3; i++) + for (j = d; j > 0; j--) + for (k = 0; k < d + 2; k++) + for (l = 0; l < d + 4; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp for nowait + for (i = 0; i < N; i++) + if (a[i] != 3 * (i >= 1)) + abort (); + #pragma omp for collapse(2) private(k) nowait + for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 4; k++) + if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1)) + abort (); + #pragma omp for collapse(3) nowait + for (i = 0; i < N / 32; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 8; k++) + if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0)) + abort (); + #pragma omp for collapse(2) private(k) nowait + for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 6; k++) + if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3)) + abort (); + } + return 0; +} Jakub