https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82730

            Bug ID: 82730
           Summary: extra store/reload of an XMM for every byte extracted
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Keywords: missed-optimization, ssemmx
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: peter at cordes dot ca
  Target Milestone: ---
            Target: x86_64-*-*, i?86-*-*

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

void p128_as_u8hex(__m128i in) {
    _Alignas(16) uint8_t v[16];
    _mm_store_si128((__m128i*)v, in);

    printf("v16.u8: %#x %#x %#x %#x | %#x %#x %#x %#x | %#x %#x %#x %#x | %#x
%#x %#x %#x\n",
           v[0], v[1],  v[2],  v[3],  v[4],  v[5],  v[6],  v[7],
           v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]);
}

https://godbolt.org/g/yoikq9
-O3  (or any -march=, with -mno-sse4 so pextrb isn't available)

        subq    $288, %rsp                   # 288 bytes!!!
        movl    $.LC0, %edi
        movaps  %xmm0, 8(%rsp)               # store
        movdqa  8(%rsp), %xmm6               # reload twice...
        movdqa  8(%rsp), %xmm1
        movaps  %xmm6, 184(%rsp)             # spill somewhere else
        movzbl  199(%rsp), %eax              # v[15]
        movaps  %xmm1, 264(%rsp)
        movzbl  8(%rsp), %esi                # v[0]
        movaps  %xmm1, 248(%rsp)
        ...
        pushq   %rax                         # v[15]

        movdqa  16(%rsp), %xmm7
        movaps  %xmm7, 176(%rsp)
        movzbl  190(%rsp), %eax
        pushq   %rax                         # v[14]

        movdqa  24(%rsp), %xmm0
        movaps  %xmm0, 168(%rsp)
        movzbl  181(%rsp), %eax
        pushq   %rax
        ...
        xorl    %eax, %eax
        call    printf
        addq    $376, %rsp
        ret

This is pretty hilariously bad, especially compared to the scalar code that
gcc6.3 produces:

        subq    $32, %rsp
        movq    %xmm0, %r9
        movq    %xmm0, %rcx
            # ok this is a bit silly vs. a scalar mov.
            # very few CPUs can do parallel movq, so there's a resource
            # conflict anyway, making this no better than a GP->GP mov
        movaps  %xmm0, 8(%rsp)
        movq    16(%rsp), %rax        # high half
        shrq    $32, %r9
        shrq    $16, %rcx
        movq    %xmm0, %r8
        movq    %xmm0, %rdx
        movzbl  %cl, %ecx
        movzbl  %r8b, %esi
        movzbl  %dh, %edx             # using dh to save on shifts
        movzbl  %r9b, %r9d
        shrl    $24, %r8d
        movq    %rax, %rdi
        shrq    $56, %rdi
        pushq   %rdi
        ...

Not perfect (related to bug 67072), but at least it doesn't do a chain of vector
copies all over the place.
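
For comparison, a minimal C-intrinsics sketch of that scalar strategy
(hypothetical function name, not part of the testcase; assumes SSE2, a 64-bit
target, and the same #includes as the testcase above):

void p128_as_u8hex_gp(__m128i in) {
    // pull both 64-bit halves into GP registers once, then shift/mask the
    // bytes out, instead of bouncing every byte through vector stores/reloads
    uint64_t lo = (uint64_t)_mm_cvtsi128_si64(in);                          // movq xmm -> GP
    uint64_t hi = (uint64_t)_mm_cvtsi128_si64(_mm_unpackhi_epi64(in, in));  // or a 16B store + 8B reload, like gcc6.3
    printf("v16.u8: %#x %#x %#x %#x | %#x %#x %#x %#x | %#x %#x %#x %#x | %#x %#x %#x %#x\n",
           (unsigned)(lo & 0xff),       (unsigned)(lo >> 8 & 0xff),
           (unsigned)(lo >> 16 & 0xff), (unsigned)(lo >> 24 & 0xff),
           (unsigned)(lo >> 32 & 0xff), (unsigned)(lo >> 40 & 0xff),
           (unsigned)(lo >> 48 & 0xff), (unsigned)(lo >> 56),
           (unsigned)(hi & 0xff),       (unsigned)(hi >> 8 & 0xff),
           (unsigned)(hi >> 16 & 0xff), (unsigned)(hi >> 24 & 0xff),
           (unsigned)(hi >> 32 & 0xff), (unsigned)(hi >> 40 & 0xff),
           (unsigned)(hi >> 48 & 0xff), (unsigned)(hi >> 56));
}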

--------

OTOH, we could vectorize the unpack and store to stack memory in 16B chunks
(e.g. with pmovzxbd, or a 2-step punpck against zero; see the sketch below).
This is much more profitable in 32-bit mode, where all args are stack args and
a 16B vector holds 4 args instead of 2.
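
A minimal sketch of that widening with SSE2-only punpck (the helper name and
the caller-provided 16B-aligned output buffer are just for illustration):

static void widen_u8_to_u32(__m128i in, uint32_t out[16]) {
    // two unpack steps against zero: bytes -> u16 -> u32; out must be 16B-aligned
    __m128i zero = _mm_setzero_si128();
    __m128i lo16 = _mm_unpacklo_epi8(in, zero);    // bytes 0..7  zero-extended to u16
    __m128i hi16 = _mm_unpackhi_epi8(in, zero);    // bytes 8..15 zero-extended to u16
    _mm_store_si128((__m128i*)&out[0],  _mm_unpacklo_epi16(lo16, zero));  // bytes 0..3  as u32
    _mm_store_si128((__m128i*)&out[4],  _mm_unpackhi_epi16(lo16, zero));  // bytes 4..7
    _mm_store_si128((__m128i*)&out[8],  _mm_unpacklo_epi16(hi16, zero));  // bytes 8..11
    _mm_store_si128((__m128i*)&out[12], _mm_unpackhi_epi16(hi16, zero));  // bytes 12..15
}

In 32-bit mode, each of those four 16B stores could then feed the pushes (or
the outgoing-arg stores) directly from stack memory.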

For printing as 32-bit or 64-bit integers, we can just store the vector to the
stack instead of getting each element out separately!  (Should I report that as
a separate missed optimization?)  For

void p128_as_u32hex(__m128i in) {
    //const uint32_t *v = (const uint32_t*) &in;
    _Alignas(16) uint32_t v[4];
    _mm_store_si128((__m128i*)v, in);
    printf("v4.u32: %#x %#x %#x %#x\n", v[0], v[1], v[2], v[3]);
}

where we get (with gcc -O3 -m32)

        pshufd  $255, %xmm0, %xmm1
        movd    %xmm1, %eax
        movdqa  %xmm0, %xmm1
        pushl   %eax
        punpckhdq       %xmm0, %xmm1
        movd    %xmm1, %eax
        pshufd  $85, %xmm0, %xmm1
        pushl   %eax
        ...

instead of a single movaps store.  Or for printing as uint64_t, we get

        movhps  %xmm0, 20(%esp)
        pushl   24(%esp)
        pushl   24(%esp)
        movq    %xmm0, 28(%esp)
        pushl   32(%esp)
        pushl   32(%esp)
