https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115021

--- Comment #2 from Roger Sayle <roger at nextmovesoftware dot com> ---
Here's a reduced test case that should be unaffected by the pending changes to
how V8QI shifts are expanded.  Note that the final "t -= t4" is required to
convince the register allocator to "spill".

/* 128-bit vector of sixteen signed 8-bit lanes (GCC vector extension). */
typedef signed char v16qi __attribute__ ((__vector_size__ (16)));
// sign-extend low 3 bits to a byte.
// Per lane: keep the low 3 bits of x, then apply (v ^ 4) - 4, which maps
// 0..3 to 0..3 and 4..7 to -4..-1, i.e. bit 2 is treated as the sign bit.
v16qi foo (v16qi x) {
    v16qi t7 = (v16qi){7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7};  // mask: low 3 bits of every lane
    v16qi t4 = (v16qi){4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4};  // bit 2 of every lane (the 3-bit sign bit)
    v16qi t = x & t7;   // discard everything above bit 2
    t ^= t4;            // flip the 3-bit sign bit
    // Second use of t4. Per the report, this statement is required for the
    // register allocator to emit the spill, so it must stay as written.
    t -= t4;
    return t;
}

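which produces the following assembly (note the store to -24(%rsp) that is
immediately followed by a reload from the same slot):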

# Compiler output for foo (x86-64, AT&T syntax: source operands first, destination last).
# x arrives in %xmm0 and the result is returned in %xmm0 (assuming the SysV AMD64 ABI).
foo:    movl    $67372036, %eax                 # 0x04040404: four bytes of 4
        vmovdqa %xmm0, %xmm2                    # xmm2 = x, freeing xmm0 for the result
        vpbroadcastd    %eax, %xmm1             # xmm1 = t4 (every byte lane = 4)
        movl    $117901063, %eax                # 0x07070707: four bytes of 7
        vpbroadcastd    %eax, %xmm3             # xmm3 = t7 (every byte lane = 7)
        vmovdqa %xmm1, %xmm0                    # xmm0 = copy of t4; vpternlogd overwrites its destination
        vmovdqa %xmm3, -24(%rsp)                # store t7 to a stack slot below %rsp ...
        vmovdqa -24(%rsp), %xmm4                # ... and reload it at once: the spill this report is about
        vpternlogd      $120, %xmm2, %xmm4, %xmm0       # imm 0x78: xmm0 = xmm0 ^ (xmm4 & xmm2) = t4 ^ (t7 & x)
        vpsubb  %xmm1, %xmm0, %xmm0             # xmm0 = xmm0 - t4, per byte lane
        ret

Reply via email to