https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102789

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
          Component|libgomp                     |target
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2021-10-18
                 CC|                            |dje at gcc dot gnu.org,
                   |                            |segher at gcc dot gnu.org

--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Ok, I can reproduce, but only with -mcpu=power7.  The cost model patch just
uncovered a latent power7 vectorization bug (backend or vectorizer) I'd say.
I've instrumented the testcase a little bit:
// { dg-do run }
// { dg-additional-options "-msse2" { target sse2_runtime } }
// { dg-additional-options "-mavx" { target avx_runtime } }

// abort is declared by hand with C linkage; no header is included here.
extern "C" void abort ();
// Work arrays shared by foo, bar and main: 1024 elements each, 32-byte
// aligned.  Only element 0 gets the explicit initializer 1; main overwrites
// every element before the first call.
int a[1024] __attribute__((aligned (32))) = { 1 };
int b[1024] __attribute__((aligned (32))) = { 1 };
unsigned char c[1024] __attribute__((aligned (32))) = { 1 };
// k is the variable named in the linear(k: m + 1) clauses; m supplies the
// step.  main sets k = 4 and m = 2 before the calls.
int k, m;
// Receive the addresses of k, s2 and t (cast to an integer type) inside the
// second simd loop of foo and bar; they are listed in its lastprivate clause.
__UINTPTR_TYPE__ u, u2, u3;

// Runs two `omp simd` loops with compile-time bounds, [0, 512) and
// [512, 1024), over the global arrays a and c, reading the per-element
// operand from p.  Both loops advance the global k by m + 1 per iteration.
// Prints the four partial results and returns s + s2 + t + t2.
// p is declared 32-byte aligned by the `aligned` clauses; main passes b.
__attribute__((noinline, noclone)) int
foo (int *p)
{
  int i, s = 0, s2 = 0, t, t2;
  // First loop: t2 is computed from k *before* the increment, s accumulates
  // p[i] plus k *after* the increment.  t2 is lastprivate, so after the loop
  // it holds the value from the final iteration (i == 511).
  #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s) \
                   lastprivate (t2)
  for (i = 0; i < 512; i++)
    {
      a[i] *= p[i];
      t2 = k + p[i];
      k += m + 1;
      s += p[i] + k;
      c[i]++;
    }
  // Second loop: here k is incremented first, so t uses the updated k.
  // The addresses of k, s2 and t are stored into the globals u, u2, u3.
  #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \
                   lastprivate (t, u, u2, u3)
  for (i = 512; i < 1024; i++)
    {
      a[i] *= p[i];
      k += m + 1;
      t = k + p[i];
      u = (__UINTPTR_TYPE__) &k;
      u2 = (__UINTPTR_TYPE__) &s2;
      u3 = (__UINTPTR_TYPE__) &t;
      s2 += t;
      c[i]++;
    }
// Debug output of the partial results (presumably the instrumentation added
// for this report, judging by the non-matching indentation).
__builtin_printf ("foo %d %d %d %d\n", s, s2, t, t2);
  return s + s2 + t + t2;
}

// Same two `omp simd` loops as foo, but with run-time bounds: the first loop
// covers [0, n) and the second [n, o), and the counters/accumulators are
// long int.  Prints the four partial results and returns s + s2 + t + t2.
// p is declared 32-byte aligned by the `aligned` clauses; main passes b.
__attribute__((noinline, noclone)) long int
bar (int *p, long int n, long int o)
{
  long int i, s = 0, s2 = 0, t, t2;
  // First loop: t2 uses k before the increment, s uses k after it.
  // t2 is lastprivate, so it ends up with the value from iteration n - 1.
  #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s) \
                   lastprivate (t2)
  for (i = 0; i < n; i++)
    {
      a[i] *= p[i];
      t2 = k + p[i];
      k += m + 1;
      s += p[i] + k;
      c[i]++;
    }
  // Second loop: k is incremented first, then t = k + p[i] is added to s2.
  // The addresses of k, s2 and t are stored into the globals u, u2, u3.
  #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \
                   lastprivate (t, u, u2, u3)
  for (i = n; i < o; i++)
    {
      a[i] *= p[i];
      k += m + 1;
      t = k + p[i];
      u = (__UINTPTR_TYPE__) &k;
      u2 = (__UINTPTR_TYPE__) &s2;
      u3 = (__UINTPTR_TYPE__) &t;
      s2 += t;
      c[i]++;
    }
// Debug output of the partial results.
// NOTE(review): s, s2, t and t2 are long int but are printed with %d; that
// only matches where int and long have the same width (e.g. with -m32).
__builtin_printf ("bar %d %d %d %d\n", s, s2, t, t2);
  return s + s2 + t + t2;
}

// Test driver: fills the arrays, runs foo once and bar twice (first with the
// same bounds as foo, then with the shortened bounds 511/1021), and after each
// call checks the arrays, k and the returned sum.  Every failing check prints
// a tagged diagnostic (#1 .. #7) and aborts.  Returns 0 on success.
int
main ()
{
// All checks are compiled out when int is narrower than 4 bytes.
#if __SIZEOF_INT__ >= 4
  int i;
  // Starting value of k and the step operand; the per-iteration step is
  // m + 1 == 3.
  k = 4;
  m = 2;
  for (i = 0; i < 1024; i++)
    {
      a[i] = i - 512;
      b[i] = (i - 51) % 39;
      c[i] = (unsigned char) i;
    }
  int s = foo (b);
  // After foo: b must be untouched, every a[i] multiplied by b[i] once and
  // every c[i] incremented once.  a is reset for the next call.
  for (i = 0; i < 1024; i++)
    {
      if (b[i] != (i - 51) % 39
          || a[i] != (i - 512) * b[i]
          || c[i] != (unsigned char) (i + 1))
{
__builtin_printf ("#1 %d %d %d %d\n", i, b[i], a[i], c[i]);
        abort ();
}
      a[i] = i - 512;
    }
  // k advanced by 3 in each of the 1024 iterations.  In the expected sum,
  // 1596127 is the s + s2 part, followed by the expected t2 (k before the
  // increment at i == 511) and the expected t (k after the last increment).
  if (k != 4 + 3 * 1024
      || s != 1596127 + (4 + 3 * 511 + b[511]) + (4 + 3 * 1024 + b[1023]))
{
__builtin_printf ("#2 %d %d\n", k, s);
    abort ();
}
  // Same iteration ranges as foo, but through the run-time-bound bar.
  k = 4;
  s = bar (b, 512, 1024);
  // c has now been incremented twice in total.
  for (i = 0; i < 1024; i++)
    {
      if (b[i] != (i - 51) % 39
          || a[i] != (i - 512) * b[i]
          || c[i] != (unsigned char) (i + 2))
{
__builtin_printf ("#3 %d %d %d %d\n", i, b[i], a[i], c[i]);
        abort ();
}
      a[i] = i - 512;
    }
  if (k != 4 + 3 * 1024
      || s != 1596127 + (4 + 3 * 511 + b[511]) + (4 + 3 * 1024 + b[1023]))
{
__builtin_printf ("#4 %d %d\n", k, s);
    abort ();
}
  // Shortened ranges: loops cover [0, 511) and [511, 1021), so the last three
  // elements must be left alone.
  k = 4;
  s = bar (b, 511, 1021);
  // Elements below 1021 were processed a third time.
  for (i = 0; i < 1021; i++)
    {
      if (b[i] != (i - 51) % 39
          || a[i] != (i - 512) * b[i]
          || c[i] != (unsigned char) (i + 3))
{
__builtin_printf ("#5 %d %d %d %d\n", i, b[i], a[i], c[i]);
        abort ();
}
      a[i] = i - 512;
    }
  // Elements 1021..1023 were skipped by the last call: a still holds the
  // reset value and c has only been incremented twice.
  for (i = 1021; i < 1024; i++)
    if (b[i] != (i - 51) % 39
        || a[i] != i - 512
        || c[i] != (unsigned char) (i + 2))
{
__builtin_printf ("#6 %d %d %d %d\n", i, b[i], a[i], c[i]);
      abort ();
}
  // 1021 iterations in total this time; 1586803 is the expected s + s2,
  // followed by the expected t2 (i == 510) and t (i == 1020).
  if (k != 4 + 3 * 1021
      || s != 1586803 + (4 + 3 * 510 + b[510]) + (4 + 3 * 1021 + b[1020]))
{
__builtin_printf ("#7 %d %d %d %d\n", k, s, b[510], b[1020]);
    abort ();
}
#endif
  return 0;
}

When compiled with -O2 -m32 -fopenmp -mcpu=power6, this prints:
foo 403860 1192267 3112 1568
bar 403860 1192267 3112 1568
bar 402289 1184514 3100 1564
while with -O2 -m32 -fopenmp -mcpu=power7 it prints:
foo 403860 1192267 3112 1568
bar 403860 1192267 3112 1568
bar 402289 919217 3100 1564
#7 3067 1326170 30 33
Aborted
which seems to suggest it is the:
  #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \
                   lastprivate (t, u, u2, u3)
  for (i = n; i < o; i++)
    {
      a[i] *= p[i];
      k += m + 1;
      t = k + p[i];
      u = (__UINTPTR_TYPE__) &k;
      u2 = (__UINTPTR_TYPE__) &s2;
      u3 = (__UINTPTR_TYPE__) &t;
      s2 += t;
      c[i]++;
    }
loop that is miscompiled and miscomputes s2.
Now, the loop is invoked twice, once with n = 512 and o = 1024 and in that case
it works fine, and then with n = 511 and o = 1021 and in that case it
misbehaves, so I bet it must be related to the prologue or epilogue loops.

Now, if I compile with -O2 -m32 -fopenmp -mcpu=power7
-fvect-cost-model=unlimited -fsimd-cost-model=unlimited, it is already
miscompiled the same way in r9-1520-g42c5d1212ff6544be1061d488aa7ebee9463c375
(I haven't bisected fully), while the revisions from
r5-370-ged15c5984e10f6556dffdf397accff804bf60a7c through
r9-1052-gfa725532c41ae543fd0078263ea348aa5af3997d certainly ICE on it
instead:
simd-3.C: In function ‘long int bar(int*, long int, long int)’:
simd-3.C:44:1: internal compiler error: in vect_get_store_cost, at
tree-vect-stmts.c:1123
 bar (int *p, long int n, long int o)
 ^~~
0x1510f87 vect_get_store_cost(_stmt_vec_info*, int, unsigned int*,
vec<stmt_info_for_cost, va_heap, vl_ptr>*)
        ../../gcc/tree-vect-stmts.c:1123
0x1510da0 vect_model_store_cost
        ../../gcc/tree-vect-stmts.c:1057
0x152200b vectorizable_store
        ../../gcc/tree-vect-stmts.c:6396
0x152cbf2 vect_analyze_stmt(gimple*, bool*, _slp_tree*, _slp_instance*,
vec<stmt_info_for_cost, va_heap, vl_ptr>*)
        ../../gcc/tree-vect-stmts.c:9550
0x153a01f vect_analyze_loop_operations
        ../../gcc/tree-vect-loop.c:1655
0x153ad28 vect_analyze_loop_2
        ../../gcc/tree-vect-loop.c:2050
0x153bd78 vect_analyze_loop(loop*, _loop_vec_info*)
        ../../gcc/tree-vect-loop.c:2343
0x157062a vectorize_loops()
        ../../gcc/tree-vectorizer.c:758
0x14196b5 execute
        ../../gcc/tree-ssa-loop.c:414
Note, r5-370 would ICE with it even with just
-O3 -fopenmp -m32 -mcpu=power7 or -O2 -fopenmp -m32 -mcpu=power7
-fvect-cost-model=unlimited.

Reply via email to