https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115021
--- Comment #2 from Roger Sayle <roger at nextmovesoftware dot com> ---

Here's a reduced test case that should be unaffected by the pending changes to
how V8QI shifts are expanded. Note that the final "t -= t4" is required to
convince the register allocator to "spill".

typedef signed char v16qi __attribute__ ((__vector_size__ (16)));

// sign-extend low 3 bits to a byte.
v16qi foo (v16qi x) {
  v16qi t7 = (v16qi){7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7};
  v16qi t4 = (v16qi){4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4};
  v16qi t = x & t7;
  t ^= t4;
  t -= t4;
  return t;
}

which produces:

foo:
        movl    $67372036, %eax
        vmovdqa %xmm0, %xmm2
        vpbroadcastd    %eax, %xmm1
        movl    $117901063, %eax
        vpbroadcastd    %eax, %xmm3
        vmovdqa %xmm1, %xmm0
        vmovdqa %xmm3, -24(%rsp)
        vmovdqa -24(%rsp), %xmm4
        vpternlogd      $120, %xmm2, %xmm4, %xmm0
        vpsubb  %xmm1, %xmm0, %xmm0
        ret