Tobias Burnus wrote:
BTW: when using in gomp_new_team:
nthreads = gomp_barrier_init (&team->barrier, nthreads);
the number of 'Success' largely increased for the omptests, only showing
the following fails:
t-reduction/test.c:535: Failed
t-reduction/test.c:568: Failed at 0 with OUT[0] + num_tests[0],
expected 2156, got 483
t-reduction/test.c:578: Failed
t-reduction/test.c:611: Failed at 0 with OUT[0] + num_tests[0],
expected 2156, got 483
The issue with those testcases is that they expect that 32 threads are
running – but with GCN offloading, only 16 are used.
See the attached reduced testcase for the second failing test; the first one
seems to be the same but uses nested parallel instead of simd, which seems
to lead to 'GCN team arena exhausted; configure with
GCN_TEAM_ARENA_SIZE=bytes'.
* * *
Note: With host fallback, the expected 32 threads are created and the
test passes; for nvptx, 24 and (as written) for gcn 16 threads are used.
Thus, the result is somewhat consistent, even though it feels wrong that
32 threads won't get used – the number isn't that high.
I think GCC is still within the permitted behavior as the code didn't
use 'num_threads(strict: 32)' – nor could it as omptests predates the
prescriptive modifier 'strict' (which is not yet implemented in GCC,
either).
Tobias
#include <stdio.h>
#include <omp.h>
#define N (957*3)
/*
 * Reduced reproducer: reduction on a simd loop nested inside a parallel
 * region that itself runs in a single-team target region.
 *
 * For every (offset, factor) pair, each thread whose id passes the filter
 * runs the simd reduction and publishes its result in RESULT[tid], flagging
 * VALID[tid].  Thread 0 then counts the participating threads in
 * num_tests[0] and verifies every published value.
 *
 * The final check expects OUT[0] + num_tests[0] == 2156, i.e. the count
 * obtained when all 32 requested threads take part; fewer threads yield a
 * smaller count and a reported failure.
 *
 * Returns the number of failed final checks (0 on success).
 */
int main(void)
{
    double Ad[N], Bd[N], Cd[N];
    long long OUT[6] = {0};
    /*
     * Zero-initialised on purpose: thread 0 scans slots 0..num_threads-1,
     * and slots belonging to threads the runtime never created must read
     * as "not valid" instead of holding indeterminate values.
     */
    double RESULT[1024] = {0};
    int VALID[1024] = {0};

    for (int i = 0; i < N; i++) {
        Ad[i] = 1 << 16;
        Bd[i] = i << 16;
        Cd[i] = -(i << 16);
    }

    //
    // Test: reduction on nested simd.
    //
    OUT[0] = 0;
    int num_threads = 32;
    int num_tests[1];
    num_tests[0] = 0;
    int trial = 0;

    #pragma omp target teams num_teams(1) thread_limit(1024)
    #pragma omp parallel num_threads(num_threads)
    {
        __builtin_printf("%d of %d\n", omp_get_thread_num(), omp_get_num_threads());
        for (int offset = 0; offset < 32; offset++) {
            for (int factor = 1; factor < 33; factor++) {
                double Rd1 = 0;
                double Rd2 = 0;
                int tid = omp_get_thread_num();
                int lid = tid % 32;

                VALID[tid] = 0;
                if (lid >= offset && lid % factor == 0) {
                    #pragma omp simd reduction(+:Rd1) reduction(-:Rd2)
                    for (int i = 0; i < N; i++) {
                        Rd1 += Ad[i] + (Bd[i] + Cd[i]);
                        Rd2 += Ad[i] + (Bd[i] + Cd[i]);
                    }
                    VALID[tid] = 1;
                    RESULT[tid] = Rd1 + Rd2;
                }

                /* All results must be published before thread 0 reads them. */
                #pragma omp barrier
                if (tid == 0) {
                    /* Bd[i] and Cd[i] cancel, so each sum is N * (1 << 16). */
                    const double expected = (double) ((2*N) << 16);

                    for (int i = 0; i < num_threads; i++) {
                        if (!VALID[i])
                            continue;
                        num_tests[0]++;
                        /* Flag deviations in either direction, not only too-large results. */
                        double diff = RESULT[i] - expected;
                        if (diff > .001 || diff < -.001) {
                            OUT[0] = 1;
                            printf ("Failed nested simd reduction\n");
                        }
                    }
                }
                /* Keep the next iteration from resetting VALID[] while thread 0 still reads it. */
                #pragma omp barrier
            }
        }
    }

    int errorNum = 0;
    for (int i = 0; i < 1; i++) {
        if (OUT[0] + num_tests[0] != 0+(trial+1)*2156) {
            printf ("%s:%d: Failed at %3d with %s, expected %lld, got %lld\n",
                    __FILE__, __LINE__, i, "OUT[0] + num_tests[0]",
                    (long long) 0+(trial+1)*2156, (long long) OUT[0] + num_tests[0]);
            errorNum++;
        }
    }
    return errorNum;
}