https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114339

--- Comment #6 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
vectorizer generates:

  mask_patt_21.19_58 = vect_perm_even_49 >= vect_cst__57;
  mask_patt_21.19_59 = vect_perm_even_55 >= vect_cst__57;
  vexit_reduc_63 = mask_patt_21.19_58 | mask_patt_21.19_59;
  if (vexit_reduc_63 != { 0, 0 })
    goto <bb 14>; [20.00%]
  else
    goto <bb 5>; [80.00%]

This is changed at loopdone into:

  delays[3].nonprimary_delay = 129600;
  vect_cst__57 = {tdiff_6, tdiff_6};
  mask_patt_21.19_58 = vect_cst__57 <= { 0, 0 };
  mask_patt_21.19_59 = vect_cst__57 <= { 0, 0x7FFFFFFFFFFFFFFF };
  vexit_reduc_63 = mask_patt_21.19_58 | mask_patt_21.19_59;
  if (vexit_reduc_63 != { 0, 0 })
    goto <bb 3>; [20.00%]
  else
    goto <bb 7>; [80.00%]

or in other words, if there's any value where the compare succeeds, find it and
return.
This looks correct to me.

It could be that my AVX is rusty, but this generates:

   vmovdqa 0xf9c(%rip),%xmm1        # 0x402010 
   mov    $0x1,%eax
   vmovq  %rcx,%xmm3
   vmovdqa %xmm0,(%rsp)
   vpunpcklqdq %xmm3,%xmm3,%xmm2
   vmovdqa %xmm0,0x10(%rsp)
   vpcmpgtq %xmm2,%xmm1,%xmm1#
   vmovdqa %xmm0,0x20(%rsp)
   vmovq  %rax,%xmm0
   vpunpcklqdq %xmm0,%xmm0,%xmm0
   movl   $0x1fa40,0x38(%rsp)
   vpcmpgtq %xmm2,%xmm0,%xmm0
   vpor   %xmm1,%xmm0,%xmm0
   vptest %xmm0,%xmm0

which looks off, particularly for the second compare: it looks like it doesn't do
a load but instead just duplicates the constant 1.
gdb seems to confirm this. At the first compare:

(gdb) p $xmm2.v2_int64
$4 = {10412095, 10412095}
(gdb) p $xmm0.v2_int64
$5 = {0, 0}

which is what's expected, but at the second compare:

(gdb) p $xmm2.v2_int64
$7 = {10412095, 10412095}
(gdb) p $xmm0.v2_int64
$6 = {1, 1}

at the second it's comparing {1, 1} instead of {0, 0x7FFFFFFFFFFFFFFF}.

On AArch64, where it doesn't fail, the comparison is:

   movi    v29.4s, 0
   add     x1, sp, 16
   ldr     x5, [x0, 8]
   mov     w0, 64064
   movk    w0, 0x1, lsl 16
   add     x3, sp, 48
   str     q29, [sp, 64]
   mov     x2, 57407
   mov     x4, 9223372036854775807
   str     x4, [sp, 64]
   movk    x2, 0x9e, lsl 16
   str     w0, [sp, 72]
   sub     x2, x2, x5
   stp     q29, q29, [x1]
   dup     v27.2d, x2
   ld2     {v30.2d - v31.2d}, [x1]
   str     q29, [sp, 48]
   ld2     {v28.2d - v29.2d}, [x3]
   cmge    v30.2d, v30.2d, v27.2d
   cmge    v28.2d, v28.2d, v27.2d
   orr     v30.16b, v30.16b, v28.16b
   umaxp   v30.4s, v30.4s, v30.4s
   fmov    x0, d30
   cbnz    x0, .L12

which has v30.2d being {0, 0} and v28.2d being {0, 0x7FFFFFFFFFFFFFFF} as
expected...

On AArch64 we don't inline the constants because whatever is propagating the
constants can't understand the LOAD_LANES:

  mask_patt_19.21_50 = vect__2.16_44 >= vect_cst__49;
  mask_patt_19.21_51 = vect__2.19_47 >= vect_cst__49;
  vexit_reduc_55 = mask_patt_19.21_50 | mask_patt_19.21_51;
  if (vexit_reduc_55 != { 0, 0 })
    goto <bb 3>; [20.00%]
  else
    goto <bb 7>; [80.00%]

so could this be another expansion bug?

Note that a simpler reproducer is this:

---
long tdiff = 10412095;

int main() {
  struct {
    long maximum;
    int nonprimary_delay;
  } delays[] = {{}, {}, {}, {9223372036854775807, 36 * 60 * 60}};

  for (unsigned i = 0; i < sizeof(delays) / sizeof(delays[0]); ++i)
    if (tdiff <= delays[i].maximum)
      return delays[i].nonprimary_delay;

  __builtin_abort();
}
---

The key point is that we're not allowed to constprop tdiff at GIMPLE. If we do,
e.g.:

int main() {
  struct {
    long maximum;
    int nonprimary_delay;
  } delays[] = {{}, {}, {}, {9223372036854775807, 36 * 60 * 60}};
  long tdiff = 10412095;

  for (unsigned i = 0; i < sizeof(delays) / sizeof(delays[0]); ++i)
    if (tdiff <= delays[i].maximum)
      return delays[i].nonprimary_delay;

  __builtin_abort();
}

then after vectorization, const prop evaluates the entire expression at
GIMPLE and it gets the right result.

This makes me believe it's a target expansion bug.

Reply via email to