https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102789
Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
          Component|libgomp                     |target
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2021-10-18
                 CC|                            |dje at gcc dot gnu.org,
                   |                            |segher at gcc dot gnu.org

--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Ok, I can reproduce, but only with -mcpu=power7. The cost model patch just uncovered a latent power7 vectorization bug (backend or vectorizer) I'd say. I've instrumented the testcase a little bit: // { dg-do run } // { dg-additional-options "-msse2" { target sse2_runtime } } // { dg-additional-options "-mavx" { target avx_runtime } } extern "C" void abort (); int a[1024] __attribute__((aligned (32))) = { 1 }; int b[1024] __attribute__((aligned (32))) = { 1 }; unsigned char c[1024] __attribute__((aligned (32))) = { 1 }; int k, m; __UINTPTR_TYPE__ u, u2, u3; __attribute__((noinline, noclone)) int foo (int *p) { int i, s = 0, s2 = 0, t, t2; #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s) \ lastprivate (t2) for (i = 0; i < 512; i++) { a[i] *= p[i]; t2 = k + p[i]; k += m + 1; s += p[i] + k; c[i]++; } #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \ lastprivate (t, u, u2, u3) for (i = 512; i < 1024; i++) { a[i] *= p[i]; k += m + 1; t = k + p[i]; u = (__UINTPTR_TYPE__) &k; u2 = (__UINTPTR_TYPE__) &s2; u3 = (__UINTPTR_TYPE__) &t; s2 += t; c[i]++; } __builtin_printf ("foo %d %d %d %d\n", s, s2, t, t2); return s + s2 + t + t2; } __attribute__((noinline, noclone)) long int bar (int *p, long int n, long int o) { long int i, s = 0, s2 = 0, t, t2; #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s) \ lastprivate (t2) for (i = 0; i < n; i++) { a[i] *= p[i]; t2 = k + p[i]; k += m + 1; s += p[i] + k; c[i]++; } #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \ lastprivate (t, u, u2, u3) for (i = n; i < o; i++) { a[i] *= p[i]; k
+= m + 1; t = k + p[i]; u = (__UINTPTR_TYPE__) &k; u2 = (__UINTPTR_TYPE__) &s2; u3 = (__UINTPTR_TYPE__) &t; s2 += t; c[i]++; } __builtin_printf ("bar %d %d %d %d\n", s, s2, t, t2); return s + s2 + t + t2; } int main () { #if __SIZEOF_INT__ >= 4 int i; k = 4; m = 2; for (i = 0; i < 1024; i++) { a[i] = i - 512; b[i] = (i - 51) % 39; c[i] = (unsigned char) i; } int s = foo (b); for (i = 0; i < 1024; i++) { if (b[i] != (i - 51) % 39 || a[i] != (i - 512) * b[i] || c[i] != (unsigned char) (i + 1)) { __builtin_printf ("#1 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } a[i] = i - 512; } if (k != 4 + 3 * 1024 || s != 1596127 + (4 + 3 * 511 + b[511]) + (4 + 3 * 1024 + b[1023])) { __builtin_printf ("#2 %d %d\n", k, s); abort (); } k = 4; s = bar (b, 512, 1024); for (i = 0; i < 1024; i++) { if (b[i] != (i - 51) % 39 || a[i] != (i - 512) * b[i] || c[i] != (unsigned char) (i + 2)) { __builtin_printf ("#3 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } a[i] = i - 512; } if (k != 4 + 3 * 1024 || s != 1596127 + (4 + 3 * 511 + b[511]) + (4 + 3 * 1024 + b[1023])) { __builtin_printf ("#4 %d %d\n", k, s); abort (); } k = 4; s = bar (b, 511, 1021); for (i = 0; i < 1021; i++) { if (b[i] != (i - 51) % 39 || a[i] != (i - 512) * b[i] || c[i] != (unsigned char) (i + 3)) { __builtin_printf ("#5 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } a[i] = i - 512; } for (i = 1021; i < 1024; i++) if (b[i] != (i - 51) % 39 || a[i] != i - 512 || c[i] != (unsigned char) (i + 2)) { __builtin_printf ("#6 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } if (k != 4 + 3 * 1021 || s != 1586803 + (4 + 3 * 510 + b[510]) + (4 + 3 * 1021 + b[1020])) { __builtin_printf ("#7 %d %d %d %d\n", k, s, b[510], b[1020]); abort (); } #endif return 0; } When compiled with -O2 -m32 -fopenmp -mcpu=power6, this prints: foo 403860 1192267 3112 1568 bar 403860 1192267 3112 1568 bar 402289 1184514 3100 1564 while with -O2 -m32 -fopenmp -mcpu=power7 it prints: foo 403860 1192267 3112 1568 bar 403860 1192267 3112 1568 bar 
402289 919217 3100 1564 #7 3067 1326170 30 33 Aborted which seems to suggest it is the: #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \ lastprivate (t, u, u2, u3) for (i = n; i < o; i++) { a[i] *= p[i]; k += m + 1; t = k + p[i]; u = (__UINTPTR_TYPE__) &k; u2 = (__UINTPTR_TYPE__) &s2; u3 = (__UINTPTR_TYPE__) &t; s2 += t; c[i]++; } loop that is miscompiled and miscomputes s2. Now, the loop is invoked twice, once with n = 512 and o = 1024 and in that case it works fine, and then with n = 511 and o = 1021 and in that case it misbehaves, so I bet it must be related to the prologue or epilogue loops. Now, if I compile with -O2 -m32 -fopenmp -mcpu=power7 -fvect-cost-model=unlimited -fsimd-cost-model=unlimited, it is miscompiled the same way already in r9-1520-g42c5d1212ff6544be1061d488aa7ebee9463c375 (haven't bisected fully), but certainly r5-370-ged15c5984e10f6556dffdf397accff804bf60a7c through r9-1052-gfa725532c41ae543fd0078263ea348aa5af3997d have been ICEing on it instead: simd-3.C: In function ‘long int bar(int*, long int, long int)’: simd-3.C:44:1: internal compiler error: in vect_get_store_cost, at tree-vect-stmts.c:1123 bar (int *p, long int n, long int o) ^~~ 0x1510f87 vect_get_store_cost(_stmt_vec_info*, int, unsigned int*, vec<stmt_info_for_cost, va_heap, vl_ptr>*) ../../gcc/tree-vect-stmts.c:1123 0x1510da0 vect_model_store_cost ../../gcc/tree-vect-stmts.c:1057 0x152200b vectorizable_store ../../gcc/tree-vect-stmts.c:6396 0x152cbf2 vect_analyze_stmt(gimple*, bool*, _slp_tree*, _slp_instance*, vec<stmt_info_for_cost, va_heap, vl_ptr>*) ../../gcc/tree-vect-stmts.c:9550 0x153a01f vect_analyze_loop_operations ../../gcc/tree-vect-loop.c:1655 0x153ad28 vect_analyze_loop_2 ../../gcc/tree-vect-loop.c:2050 0x153bd78 vect_analyze_loop(loop*, _loop_vec_info*) ../../gcc/tree-vect-loop.c:2343 0x157062a vectorize_loops() ../../gcc/tree-vectorizer.c:758 0x14196b5 execute ../../gcc/tree-ssa-loop.c:414 Note, r5-370 would ICE with it even with just -O3 
-fopenmp -m32 -mcpu=power7 or -O2 -fopenmp -m32 -mcpu=power7 -fvect-cost-model=unlimited.