--- demo.c ---
/* Return 1 if argument is a power of two (exactly one bit set), else 0. */
int ispowerof2(unsigned long long argument) {
    /* '-' binds tighter than '&', so this is argument & (argument - 1),
       which clears the lowest set bit; the result is 0 only when at most
       one bit was set. The != 0 test rejects zero itself. */
    return (argument != 0) && ((argument & argument - 1) == 0);
}
--- EOF ---

GCC 13.1    gcc -m32 -mavx -O3 # or -march=native instead of -mavx

https://gcc.godbolt.org/z/T31Gzo85W
ispowerof2(unsigned long long):
        vmovq   xmm1, QWORD PTR [esp+4]        ->    movq     xmm0, qword ptr [esp+4]
        xor     eax, eax                       ->    xor      eax, eax
        vpunpcklqdq     xmm0, xmm1, xmm1       # superfluous
        vptest  xmm0, xmm0                     ->    ptest    xmm0, xmm0
        je      .L1                            ->    jz       .L1
        vpcmpeqd        xmm0, xmm0, xmm0       ->    pcmpeqd  xmm1, xmm1
        xor     eax, eax                       # superfluous
        vpaddq  xmm0, xmm1, xmm0               ->    paddq    xmm1, xmm0
        vpand   xmm0, xmm0, xmm1               # superfluous
        vpunpcklqdq     xmm0, xmm0, xmm0       # superfluous
        vptest  xmm0, xmm0                     ->    ptest    xmm1, xmm0
        sete    al                             ->    setz     al
.L1:
        ret                                    ->    ret

4 out of 13 instructions are SUPERFLUOUS here!

OUCH #1: there's ABSOLUTELY no need to generate AVX instructions and
         bloat the code through VEX prefixes and longer instructions!

OUCH #2: [V]MOVQ clears the upper lane of XMM registers; there's
         ABSOLUTELY no need for [V]PUNPCKLQDQ instructions.

GCC 13.1    gcc -m32 -msse4.1 -O3

https://gcc.godbolt.org/z/bqsqec6r1
ispowerof2(unsigned long long):
        movq    xmm1, QWORD PTR [esp+4]       ->    movq    xmm0, [esp+4]
        xor     eax, eax                      ->    xor     eax, eax
        movdqa  xmm0, xmm1                    # superfluous
        punpcklqdq      xmm0, xmm1            # superfluous
        ptest   xmm0, xmm0                    ->    ptest   xmm0, xmm0
        je      .L1                           ->    jz      .L1
        pcmpeqd xmm0, xmm0                    ->    pcmpeqd xmm1, xmm1
        xor     eax, eax                      # superfluous
        paddq   xmm0, xmm1                    ->    paddq   xmm1, xmm0
        pand    xmm0, xmm1                    # superfluous
        punpcklqdq      xmm0, xmm0            # superfluous
        ptest   xmm0, xmm0                    ->    ptest   xmm1, xmm0
        sete    al                            ->    setz    al
.L1:
        ret                                   ->    ret

5 out of 14 instructions are superfluous here, or 18 of 50 bytes!

OUCH #3/#4: see above!

Will GCC eventually generate proper SSE4.1/AVX code?

Stefan

Reply via email to