https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120233
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
As for recognizing patterns,
_1 = *b_10(D);
_2 = _1 >> 8;
_17 = {_2, _1};
_5 = MEM[(short int *)b_10(D) + 2B];
_6 = _5 >> 8;
_16 = {_6, _5};
vect__3.3_18 = VEC_PACK_TRUNC_EXPR <_17, _16>;
is
_A = {_1, _5}
_B = VIEW_CONVERT <V4QI> (_A);
_C = VEC_PERM (_B, _B, { 1, 0, 3, 2 });
with the opportunity to detect {_1, _5} as a V4QI load. IMO this is "perfect"
for the bswap byte flow tracking machinery. It is of course also a missed
optimization in the vectorizer itself, where we fail at SLP discovery.
We could split
_1 = *b_10(D);
_2 = _1 >> 8;
_3 = (char) _2;
_4 = (char) _1;
into
_2 = *((char*)b_10(D) + 1);
_4 = *(char *)b_10(D);
which would resolve this but might not be a win. Alternatively
_1 = *b_10(D);
_2 = BIT_FIELD_REF <_1, 8, 8>;
_4 = BIT_FIELD_REF <_1, 8, 0>;
but I think the latter is folded to a truncation. Also SLP discovery
does not support BIT_FIELD_REF of non-vectors, so it would not help
without enhancing that.
GIMPLE testcase (starting at the "slp" pass) for SLP of foo2:
/* GIMPLE front-end testcase (SSA form, pass pipeline started at "slp").
   Loads two shorts through B and stores four chars through A: for each
   short, first the 8-bit field at bit offset 8, then the value truncated
   to char.  The shift+truncate statements are kept as comments next to
   the __BIT_FIELD_REF statements that replace them in this variant.  */
void __GIMPLE(ssa,startwith("slp"))
foo2 (char* a, short* __restrict b)
{
short _2;
// short _5;   (only needed by the commented-out shift form below)
short _4;
// short _8;   (only needed by the commented-out shift form below)
char _6;
char _7;
char _9;
char _10;
__BB(2):
// First short: load from b.
_2 = __MEM <short> (b_3(D));
// Shift+truncate form, replaced by the __BIT_FIELD_REF below:
// _5 = _2 >> 8;
// _6 = (char) _5;
// 8 bits of _2 starting at bit position 8.
_6 = __BIT_FIELD_REF <char> (_2, 8, 8);
// _2 truncated to char.
_7 = (char) _2;
// Second short: load from b with a constant offset of 2
// (NOTE(review): offset presumably in bytes, i.e. b[1] -- confirm).
_4 = __MEM <short> (b_3(D) + _Literal (short *) 2);
// Shift+truncate form, replaced by the __BIT_FIELD_REF below:
// _8 = _4 >> 8;
// _9 = (char) _8;
// 8 bits of _4 starting at bit position 8.
_9 = __BIT_FIELD_REF <char> (_4, 8, 8);
// _4 truncated to char.
_10 = (char) _4;
// Four char stores through a at constant offsets 0..3, in the order
// bit-field of _2, truncation of _2, bit-field of _4, truncation of _4.
__MEM <char> (a_1(D)) = _6;
__MEM <char> (a_1(D) + _Literal (char *) 1) = _7;
__MEM <char> (a_1(D) + _Literal (char *) 2) = _9;
__MEM <char> (a_1(D) + _Literal (char *) 3) = _10;
return;
}