https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120233

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
As for recognizing patterns,

  _1 = *b_10(D);
  _2 = _1 >> 8;
  _17 = {_2, _1};
  _5 = MEM[(short int *)b_10(D) + 2B];
  _6 = _5 >> 8;
  _16 = {_6, _5};
  vect__3.3_18 = VEC_PACK_TRUNC_EXPR <_17, _16>;

is 

  _A = {_1, _5}
  _B = VIEW_CONVERT <V4QI> (_A);
  _C = VEC_PERM (_B, _B, { 1, 0, 3, 2 });

with the opportunity to detect {_1, _5} as a V4QI load.  IMO this is "perfect"
for the bswap byte flow tracking machinery.  And of course it is also a missed
optimization for the vectorization itself, where we fail at SLP discovery.

We could split

  _1 = *b_10(D);
  _2 = _1 >> 8;
  _3 = (char) _2;
  _4 = (char) _1;

into

  _2 = *((char*)b_10(D) + 1);
  _4 = *(char *)b_10(D);

which would resolve this but might not be a win.  Alternatively

  _1 = *b_10(D);
  _2 = BIT_FIELD_REF <_1, 8, 8>;
  _4 = BIT_FIELD_REF <_1, 8, 0>;

but I think the latter is folded to a truncation.  Also SLP discovery
does not support BIT_FIELD_REF of non-vectors, so it would not help
without enhancing that.

gimple testcase for SLP of foo2:

// GIMPLE front-end testcase for SLP discovery on foo2.
//
// Two adjacent shorts are loaded from b; for each, the byte at bit offset 8
// and the truncated value are stored to four consecutive chars of a, in the
// order { bits 8..15 of b[0], (char) b[0], bits 8..15 of b[1], (char) b[1] }.
//
// The byte at bit offset 8 is extracted with __BIT_FIELD_REF instead of the
// original "shift right by 8, then truncate" sequence; the shift-based form
// is kept below as commented-out lines for comparison.
// NOTE(review): startwith("slp") is expected to begin compilation at the SLP
// pass -- confirm against the GIMPLE FE documentation for the GCC in use.
void __GIMPLE(ssa,startwith("slp"))
foo2 (char* a, short* __restrict b)
{
  short _2;
//  short _5;   (shift temporary for _2, unused in the BIT_FIELD_REF form)
  short _4;
//  short _8;   (shift temporary for _4, unused in the BIT_FIELD_REF form)
  char _6;
  char _7;
  char _9;
  char _10;

  __BB(2):
  // First short: load from b at offset 0.
  _2 = __MEM <short> (b_3(D));
//  _5 = _2 >> 8;
//  _6 = (char) _5;
  // Replacement for the two commented-out statements above: 8 bits of _2
  // starting at bit offset 8.
  _6 = __BIT_FIELD_REF <char> (_2, 8, 8);
  // Plain truncation of _2 to char.
  _7 = (char) _2;
  // Second short: load from b at offset 2 (the next short).
  _4 = __MEM <short> (b_3(D) + _Literal (short *) 2);
//  _8 = _4 >> 8;
//  _9 = (char) _8;
  // Same extraction as for _2: 8 bits of _4 starting at bit offset 8.
  _9 = __BIT_FIELD_REF <char> (_4, 8, 8);
  // Plain truncation of _4 to char.
  _10 = (char) _4;
  // Four adjacent char stores to a at offsets 0..3; this store group is what
  // SLP discovery has to handle.
  __MEM <char> (a_1(D)) = _6;
  __MEM <char> (a_1(D) + _Literal (char *) 1) = _7;
  __MEM <char> (a_1(D) + _Literal (char *) 2) = _9;
  __MEM <char> (a_1(D) + _Literal (char *) 3) = _10;
  return;
}

Reply via email to