[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-27 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

Richard Biener  changed:

   What|Removed |Added

   Priority|P3  |P2

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-21 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

--- Comment #13 from Richard Biener  ---
Testcase with just the essential stuff.

static int g_1731[7] = { 42, 0, 0, 0, 0, 0, 42 };

void __attribute__((noipa)) foo ()
{
  int l_1930[5] = { 0, };

  for (int i = 0; i < 15; ++i)
for (int j = 4; (j >= 1); j -= 1)
#pragma GCC unroll 0
  for (int k = 0; (k <= 4); k += 1)
g_1731[(j + 1)] = --l_1930[k];
}

int main()
{
  foo ();
  if (g_1731[0] != 42
  || g_1731[1] != 0 || g_1731[2] != -60 || g_1731[3] != -59
  || g_1731[4] != -58 || g_1731[5] != -57
  || g_1731[6] != 42)
__builtin_abort ();
  return 0;
}


The innermost loop body then is

   [local count: 894749066]:
  # k_26 = PHI 
  # ivtmp_23 = PHI 
  _1 = l_1930[k_26];
  _2 = _1 + -1;
  l_1930[k_26] = _2;
  g_1731[_6] = _2;
  k_17 = k_26 + 1;
  ivtmp_21 = ivtmp_23 - 1;
  if (ivtmp_21 != 0)

one should note that for data dependence analysis we'd usually need to
treat scalars (in this case SSA names) as arrays of the size of the
whole nest iteration domain and the dependences would be between
statements, not reads/writes.  So the above is

  _1 = l_1930[k_26];
  _2[i] = _1 + -1;
  l_1930[k_26] = _2[i];
  g_1731[_6] = _2[i];

then and when we interchange the loop we suddenly need two different
_2[] elements and when eliminating _2[] there's a dependence between
the l_1930 store and the implied load from a different iteration.

Note that if l_1930[k] weren't stored to g_1731[j+1], the
interchange would of course be valid, and we do not want to break
that case.

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread acoplan at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

Alex Coplan  changed:

   What|Removed |Added

 CC||acoplan at gcc dot gnu.org

--- Comment #12 from Alex Coplan  ---
The original testcase with -fsanitize=undefined shows some "load of misaligned
address" errors at runtime, which suggests it is invalid.

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

--- Comment #11 from Richard Biener  ---
Fails even with -O2 -floop-interchange -fno-move-loop-stores (otherwise we
complicate the IL by applying store-motion to g_1731).

(compute_affine_dependence
  ref_a: l_1930[k_33], stmt_a: _1 = l_1930[k_33];
  ref_b: g_1731[_51], stmt_b: g_1731[_51] = _5;
) -> no dependence
(compute_affine_dependence
  ref_a: l_1930[k_33], stmt_a: l_1930[k_33] = _2;
  ref_b: g_1731[_51], stmt_b: _4 = g_1731[_51];
) -> no dependence
(compute_affine_dependence
  ref_a: l_1930[k_33], stmt_a: l_1930[k_33] = _2;
  ref_b: g_1731[_51], stmt_b: g_1731[_51] = _5;
) -> no dependence

maybe I'm missing something but we seem to fail to honor dependences from
SSA edges?

   [local count: 894749066]:
  # k_33 = PHI 
  # ivtmp_41 = PHI 
  _1 = l_1930[k_33];
  _2 = _1 + 18446744073709551615;
  l_1930[k_33] = _2;
  _4 = g_1731[_51];
  _38 = _2 & _4;
  _5 = _38 & 38;
  g_1731[_51] = _5;
  k_24 = k_33 + 1;
  ivtmp_40 = ivtmp_41 - 1;
  if (ivtmp_40 != 0)

Of course DDR_ARE_DEPENDENT (ddr) == chrec_known just tells us there's no
memory dependence.

As said, maybe I'm missing something ...

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

--- Comment #10 from Richard Biener  ---
Somewhat more reduced:

typedef __UINT64_TYPE__ uint64_t;

static uint64_t g_1731[7] = {0xF75EE82FC4736923LL, 0, 0xF75EE82FC4736923LL, 0,
0xF75EE82FC4736923LL, 0, 0xF75EE82FC4736923LL};

void __attribute__((noipa)) foo ()
{
  uint64_t l_1930[5] = { 0x5e44d2fed3bca5f2, 0x5e44d2fed3bca5f2, 
  0x5e44d2fed3bca5f2, 0x5e44d2fed3bca5f2, 0x5e44d2fed3bca5f2 };

  for (int i = 0; i < 15; ++i)
for (int j = 4; (j >= 1); j -= 1)
#pragma GCC unroll 0
  for (int k = 0; (k <= 4); k += 1)
g_1731[(j + 1)] &= (0x26L & (--l_1930[k]));
}

int main()
{
  foo ();
  /* f75ee82fc4736923 0 2 0 0 0 f75ee82fc4736923 */
  if (g_1731[0] != 0xF75EE82FC4736923LL
  || g_1731[2] != 2
  || g_1731[4] != 0
  || g_1731[6] != 0xF75EE82FC4736923LL)
__builtin_abort ();
  return 0;
}

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread jakub at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

--- Comment #9 from Jakub Jelinek  ---
I've tried to deobfuscate the innermost loop's body:
int32_t l_1942 = (-3L);
int32_t *l_1947 = &l_1946[0][6];
int i, j;
uint64_t t1 = --l_1930[g_1179];
t1 += 0xFC07342370A5FE25ULL;
t1 &= 38;
uint64_t t2 = (g_1731[l_1719 + 1][l_1721] &= t1);
l_1942 ^= (t2 + 4ULL <= p_5.f0);
l_1943[0][1][1]++;
int32_t t3 = l_1946[0][6];
*l_1947 = t3;
l_1949 = t3 == g_1948;

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

--- Comment #8 from Richard Biener  ---
Reduced testcase:

typedef __UINT64_TYPE__ uint64_t;

static uint64_t g_1731[7][1] = {{0xF75EE82FC4736923LL},{0UL},
  {0xF75EE82FC4736923LL},{0UL},
  {0xF75EE82FC4736923LL},{0UL},
  {0xF75EE82FC4736923LL}};

static int g_149;
static unsigned short g_1179;

void __attribute__((noipa)) foo ()
{
  uint64_t l_1930[5];
  int l_1719, l_1721;

  int i;
  for (i = 0; i < 5; i++)
l_1930[i] = 0x623D9EDB6316A7CDLL;

  for (g_149 = 0; (g_149 > (-15)); g_149--)
for (l_1719 = 4; (l_1719 >= 1); l_1719 -= 1)
  for (l_1721 = 0; (l_1721 >= 0); l_1721 -= 1)
for (g_1179 = 0; (g_1179 <= 4); g_1179 += 1)
  g_1731[(l_1719 + 1)][l_1721]
&= ((0x26L & (((--l_1930[g_1179]) + 0xFC07342370A5FE25LL))));
}

int main()
{
  foo ();
  /* f75ee82fc4736923 0 2 0 0 0 f75ee82fc4736923 */
  if (g_1731[0][0] != 0xF75EE82FC4736923LL
  || g_1731[2][0] != 2
  || g_1731[4][0] != 0
  || g_1731[6][0] != 0xF75EE82FC4736923LL)
__builtin_abort ();
  return 0;
}

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

--- Comment #7 from Richard Biener  ---
(In reply to Richard Biener from comment #6)
> Confirmed with -O2 -floop-interchange.  There's just a single interchange
> done:
> 
> runData/keep/in.713.c:648:32: optimized: loops interchanged in loop nest
> 
> that's in func_2 for the nest
> 
> for (g_149 = 0; (g_149 > (-15)); g_149--)
> {
> for (l_1719 = 4; (l_1719 >= 1); l_1719 -= 1)
> {
> for (l_1721 = 0; (l_1721 >= 0); l_1721 -= 1)
> {
> struct S1 l_1935 = {0x13186D76L,0xC9L,36,24,1L,0x87L};
> for (g_1179 = 0; (g_1179 <= 4); g_1179 += 1)
> {
> int32_t l_1942 = (-3L);
> int32_t *l_1947 = &l_1946[0][6];
> int i, j;
> l_1942 ^= ((safe_add_func_uint64_t_u_u((l_1935 ,
> (((l_1936[1] != (void*)0) < (*g_511)) & (g_1731[(l_1719 + 1)][l_1721] &=
> (((0x943C8AB0L | 0xE398A931L) != g_20) , (0x26L &
> (safe_add_func_uint64_t_u_u((--l_1930[g_1179]), 0xFC07342370A5FE25LL))),
> 4L)) <= p_5.f0);
> l_1943[0][1][1]++;
> l_1949 = (((*l_1947) = l_1946[0][6]) == g_1948);
> }
> }
> }
> }

And we are interchanging the outer two loops.  Since the outer loop IV isn't
used in the body it doesn't change anything data dep wise?

Interchanging the loops in the source reproduces the issue, so somehow
for the only use of l_1719

(g_1731[(l_1719 + 1)][l_1721] &= (((0x943C8AB0L | 0xE398A931L) != g_20) ,
(0x26L & (safe_add_func_uint64_t_u_u((--l_1930[g_1179]),
0xFC07342370A5FE25LL)

it makes a difference.  It's

  g_1731[(l_1719 + 1)][l_1721] &= val;

The order in which we AND values into it shouldn't matter.  But the code is so
heavily obfuscated ...

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

--- Comment #6 from Richard Biener  ---
Confirmed with -O2 -floop-interchange.  There's just a single interchange done:

runData/keep/in.713.c:648:32: optimized: loops interchanged in loop nest

that's in func_2 for the nest

for (g_149 = 0; (g_149 > (-15)); g_149--)
{
for (l_1719 = 4; (l_1719 >= 1); l_1719 -= 1)
{
for (l_1721 = 0; (l_1721 >= 0); l_1721 -= 1)
{
struct S1 l_1935 = {0x13186D76L,0xC9L,36,24,1L,0x87L};
for (g_1179 = 0; (g_1179 <= 4); g_1179 += 1)
{
int32_t l_1942 = (-3L);
int32_t *l_1947 = &l_1946[0][6];
int i, j;
l_1942 ^= ((safe_add_func_uint64_t_u_u((l_1935 ,
(((l_1936[1] != (void*)0) < (*g_511)) & (g_1731[(l_1719 + 1)][l_1721] &=
(((0x943C8AB0L | 0xE398A931L) != g_20) , (0x26L &
(safe_add_func_uint64_t_u_u((--l_1930[g_1179]), 0xFC07342370A5FE25LL))),
4L)) <= p_5.f0);
l_1943[0][1][1]++;
l_1949 = (((*l_1947) = l_1946[0][6]) == g_1948);
}
}
}
}

[Bug tree-optimization/109184] [10/11/12/13 Regression] csmith: 2017 bug with -floop-interchange

2023-03-20 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109184

Richard Biener  changed:

   What|Removed |Added

 Status|NEW |ASSIGNED
   Assignee|unassigned at gcc dot gnu.org  |rguenth at gcc dot gnu.org

--- Comment #5 from Richard Biener  ---
I will try to have a look.