https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122604

            Bug ID: 122604
           Summary: Missed optimization opportunity: flatten branches with
                    function call into one function call that uses CMOV
           Product: gcc
           Version: 15.2.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: nicula.iccc at gmail dot com
  Target Milestone: ---

Ideally, GCC should recognize that foo_v1() is equivalent to foo_v2() and emit
the same branchless code for both. Any idea why this is not happening? Code:

    #include <cstdint>

    // Two parallel 32-slot arrays; common_part() fills slot s of each:
    // an int in _v[s] and a single byte in _s[s].
    struct Entry { int _v[32]; unsigned char _s[32]; };

    // Writes slot `s` of `e`: stores (b*3 + s) ^ 0x9e3779b9 in _v[s] and the
    // low 8 bits of b ^ s in _s[s]. No bounds check is performed, so `s` must
    // already be in [0, 31] (both arrays have 32 elements).
    // NOTE(review): __always_inline is not standard C++; presumably the glibc
    // macro that reaches this file through <cstdint> -- confirm.
    __always_inline void common_part(Entry *e, int b, int s)
    {
        // With 32-bit int, 0x9e3779b9 does not fit in int, so the XOR is done
        // in unsigned arithmetic and converted back to int on assignment.
        int t = (b*3 + s) ^ 0x9e3779b9;
        e->_v[s] = t;
        e->_s[s] = (unsigned char)((b ^ s) & 255);
    }

    // Branchy form: picks the slot index, then calls common_part() from one of
    // two branches. The two calls differ only in their third argument.
    // With -O3 -march=znver1 this compiles to a conditional jump with the
    // inlined body duplicated on both paths (see the assembly in this report).
    void foo_v1(Entry* e, bool found, unsigned m, int b) {
        // b & 31 is always in [0, 31], so k == -1 exactly when !found.
        int k = found ? (b & 31) : -1;
        if (k == -1)
            // Not found: use the index of the lowest set bit of m.
            // __builtin_ctz is undefined for m == 0; nothing here checks that.
            common_part(e, b, __builtin_ctz(m));
        else
            common_part(e, b, k);
    }

    // Hand-flattened form of foo_v1(): the same slot selection, but expressed as
    // a single common_part() call whose third argument is a conditional
    // expression. With -O3 -march=znver1 this compiles to one copy of the
    // inlined body with a cmovne selecting the index (see the assembly in this
    // report).
    void foo_v2(Entry* e, bool found, unsigned m, int b) {
        // b & 31 is always in [0, 31], so k == -1 exactly when !found.
        int k = found ? (b & 31) : -1;
        // __builtin_ctz is undefined for m == 0; nothing here checks that.
        common_part(e, b, k == -1 ? __builtin_ctz(m) : k);
    }

Assembly (-std=c++23 -O3 -march=znver1):

    foo_v1(Entry*, bool, unsigned int, int):
            lea     eax, [rcx+rcx*2]
            test    sil, sil
            je      .L2
            mov     esi, ecx
            mov     edx, ecx
            and     esi, 31
            and     edx, 31
            add     eax, esi
            xor     ecx, esi
            xor     eax, -1640531527
            mov     DWORD PTR [rdi+rdx*4], eax
            mov     BYTE PTR [rdi+128+rdx], cl
            ret
    .L2:
            tzcnt   esi, edx
            add     eax, esi
            xor     ecx, esi
            xor     eax, -1640531527
            mov     DWORD PTR [rdi+rsi*4], eax
            mov     BYTE PTR [rdi+128+rsi], cl
            ret

    foo_v2(Entry*, bool, unsigned int, int):
            mov     eax, ecx
            tzcnt   edx, edx
            and     eax, 31
            test    sil, sil
            cmovne  edx, eax
            lea     eax, [rcx+rcx*2]
            add     eax, edx
            movsx   rsi, edx
            xor     ecx, edx
            xor     eax, -1640531527
            mov     DWORD PTR [rdi+rsi*4], eax
            mov     BYTE PTR [rdi+128+rsi], cl
            ret

Godbolt: https://godbolt.org/z/8zv1s47hr

Note: Clang does not figure it out either.

Reply via email to