https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111591

--- Comment #16 from Kewen Lin <linkw at gcc dot gnu.org> ---
Tracing it down with template specialization, the abort happens on

  auto vn_b = Load(dn, in_b.get());
  HWY_ASSERT_VEC_EQ(
      dw, vw_signed_max,
      SatWidenMulPairwiseAdd(
          dw, InterleaveLower(dn_u, BitCast(dn_u, vn_b), vn_unsigned_max),
          InterleaveLower(dn, vn_b, vn_signed_max)));

with "void operator()(int8_t, CappedTag<int8_t, 8> dn)"

By isolating it, I found that "b0" does not get the expected result in function

template <class DI16, class VU8, class VI8>
HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
  RebindToUnsigned<decltype(di16)> du16;
  auto a0 = And(BitCast(di16, a), Set(di16, 255));
  auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
  auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
  auto b1 = ShiftRight<8>(BitCast(di16, b));
  return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
}

specialized with 
template <> HWY_API Vec128<int16_t, 4> SatWidenMulPairwiseAdd(Simd<int16_t, 4,
0> di16, Vec128<uint8_t, 8> a, Vec128<int8_t, 8> b)

I further found that the unexpected values come from ShiftLeft<8>: the
tree-optimized code looks as expected, but the final insn sequence is in the
wrong order. Either -fdisable-rtl-sched2 or -fdisable-rtl-sched1 makes it pass.
With a counter, I see an unexpected insn movement in sched2 on insn 395.

...

 1436: %10:DI=0x70
      REG_EQUIV 0x70
 1438: %9:DI=0xc0
      REG_EQUIV 0xc0
 1437: %8:DI=0x1e0
      REG_EQUIV 0x1e0
 1441: %7:DI=0xd0
      REG_EQUIV 0xd0
  389: %0:V2DI=[%1:DI+%9:DI]
      REG_DEAD %9:DI
      REG_EQUAL [sfp:DI+0xc0]
 1445: %5:DI=0xb0
      REG_EQUIV 0xb0
 1714: %9:DI=0xff0000
      REG_EQUIV 0xff0000
  373: [%1:DI+0x70]=%4:DI
      REG_DEAD %4:DI
  375: [%1:DI+0x78]=%6:DI
      REG_DEAD %6:DI
 1715: %9:DI=%9:DI|0xff
 1785: %25:DI=high(unspec[`*.LC8',%2:DI] 47)
 1716: %9:DI=%9:DI&0xffffffff|%9:DI<<0x20
      REG_EQUIV 0xff00ff00ff00ff
  410: %28:DI=%1:DI+0xae
      REG_EQUAL sfp:DI+0xae
    6: %31:SI=0
      REG_EQUAL 0
 1786: %25:DI=%25:DI+low(unspec[`*.LC8',%2:DI] 47)
      REG_DEAD %2:DI
      REG_EQUAL `*.LC8'
  392: [%1:DI+%7:DI]=%0:V2DI
      REG_DEAD %7:DI
                                         // the unexpected version has insn 395
                                         // moved here.
 1738: %12:V2DI=[%1:DI+%10:DI]
  376: [%1:DI+%8:DI]=%12:V2DI
      REG_DEAD %12:V2DI
      REG_DEAD %8:DI
      REG_EQUIV [sfp:DI+%8:DI]
      REG_EQUAL [sfp:DI+0x70]
  390: [%1:DI+%10:DI]=%0:V2DI            // this store updates 16 bytes at
                                         // [%1:DI+0x70], so the read below
                                         // can't be moved above it
      REG_DEAD %0:V2DI
  395: %4:DI=zero_extend([%1:DI+0x70])   //  <------ this is expected
  398: %6:DI=zero_extend([%1:DI+0x72])
  401: %7:DI=zero_extend([%1:DI+0x74])
  404: %8:DI=zero_extend([%1:DI+0x76])
  396: %4:SI=%4:SI<<0x8
  399: %6:SI=%6:SI<<0x8
  402: %7:SI=%7:SI<<0x8
  405: %8:SI=%8:SI<<0x8

 ....

The tree-optimized IR for this part looks as expected?

  <bb 51> [local count: 119292722]:
  v = a;
  MEM <unsigned char[16]> [(char * {ref-all})&D.38735] = MEM <unsigned
char[16]> [(char * {ref-all})&v];
  v ={v} {CLOBBER(eol)};
  vect_a_raw_0_1121.562_722 = MEM <vector(4) short int> [(short int
*)&D.38735];
  _215 = VIEW_CONVERT_EXPR<long unsigned int>(vect_a_raw_0_1121.562_722);
  _830 = _215 & 71777214294589695;
  _1549 = BIT_FIELD_REF <_830, 16, 32>;
  _1537 = BIT_FIELD_REF <_830, 16, 16>;
  _323 = BIT_FIELD_REF <_830, 16, 0>;
  v = b;
  MEM <unsigned char[16]> [(char * {ref-all})&b00] = MEM <unsigned char[16]>
[(char * {ref-all})&v]; 

                          ==> ref-all here, so shouldn't this be executed
                              before any of the reads below?

  v ={v} {CLOBBER(eol)};
  v = b00;
  raw_u_1323 = v.raw[0];
  _1324 = raw_u_1323 << 8;
  v.raw[0] = _1324;
  raw_u_1403 = v.raw[1];
  _1404 = raw_u_1403 << 8;
  v.raw[1] = _1404;
  raw_u_1447 = v.raw[2];
  _1448 = raw_u_1447 << 8;
  v.raw[2] = _1448;
  raw_u_128 = v.raw[3];
  _129 = raw_u_128 << 8;
  v.raw[3] = _129;
  b01 = v;
  v ={v} {CLOBBER(eol)};
  ivtmp.577_734 = (unsigned long) &MEM <struct Vec128> [(void *)&b01 + -2B];

...

I guess there is some way to keep this kind of aliasing information after
expansion; more investigation is needed into why sched considers the move safe.

Reply via email to