https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122604
Bug ID: 122604
Summary: Missed optimization opportunity: flatten branches with
function call into one function call that uses CMOV
Product: gcc
Version: 15.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: nicula.iccc at gmail dot com
Target Milestone: ---
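foo_v1() and foo_v2() below are semantically equivalent, but GCC only emits the
branchless CMOV form for foo_v2(); foo_v1() keeps a branch with the inlined body
duplicated on each side. Ideally GCC should figure out that foo_v1() can be
optimized like foo_v2(). Any idea why this is not happening? Code: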
#include <cstdint>
// Table with two parallel 32-element arrays: _v holds int values, _s holds one
// byte per slot. Both are indexed by the same slot number in common_part().
struct Entry { int _v[32]; unsigned char _s[32]; };
// Fills slot `s` of `*e` from `b`:
//   e->_v[s] = (b*3 + s) ^ 0x9e3779b9
//   e->_s[s] = low 8 bits of (b ^ s)
// `s` is used as an array index with no bounds check, so callers must pass a
// value that is valid for both arrays.
// NOTE(review): __always_inline is not standard C++; presumably the glibc
// <sys/cdefs.h> macro reached through <cstdint> -- confirm on other libcs.
__always_inline void common_part(Entry *e, int b, int s)
{
// 0x9e3779b9 does not fit in int, so the literal is unsigned and the XOR is
// done in unsigned arithmetic before being converted back to int.
int t = (b*3 + s) ^ 0x9e3779b9;
e->_v[s] = t;
// Mask to one byte before the narrowing cast.
e->_s[s] = (unsigned char)((b ^ s) & 255);
}
// Branchy variant: picks the slot index, then calls common_part() from one of
// two separate call sites that differ only in the third argument.
//   found == true  -> slot = b & 31
//   found == false -> slot = __builtin_ctz(m)
// __builtin_ctz(0) is undefined per the GCC documentation, so m must be
// non-zero whenever found is false.
void foo_v1(Entry* e, bool found, unsigned m, int b) {
// (b & 31) is always in 0..31, so k == -1 exactly when found is false.
int k = found ? (b & 31) : -1;
if (k == -1)
common_part(e, b, __builtin_ctz(m));
else
common_part(e, b, k);
}
// Flattened variant: same slot selection as foo_v1(), but the choice is folded
// into the argument expression so there is a single common_part() call site.
//   found == true  -> slot = b & 31
//   found == false -> slot = __builtin_ctz(m)
// __builtin_ctz(0) is undefined per the GCC documentation, so m must be
// non-zero whenever found is false.
void foo_v2(Entry* e, bool found, unsigned m, int b) {
// (b & 31) is always in 0..31, so k == -1 exactly when found is false.
int k = found ? (b & 31) : -1;
common_part(e, b, k == -1 ? __builtin_ctz(m) : k);
}
Assembly (-std=c++23 -O3 -march=znver1):
# Compiler output for foo_v1 (Intel syntax).
# Registers as used below: rdi = e, sil = found, edx = m, ecx = b.
# The inlined common_part body appears twice, once per side of the branch.
foo_v1(Entry*, bool, unsigned int, int):
lea eax, [rcx+rcx*2]              # eax = b*3, shared by both paths
test sil, sil
je .L2                            # found == false -> ctz path
mov esi, ecx
mov edx, ecx
and esi, 31                       # esi = b & 31, used in the arithmetic
and edx, 31                       # edx = b & 31 again, used as the store index
add eax, esi                      # b*3 + slot
xor ecx, esi                      # b ^ slot
xor eax, -1640531527              # ^ 0x9e3779b9 (same bit pattern, printed as signed)
mov DWORD PTR [rdi+rdx*4], eax    # e->_v[slot] = t
mov BYTE PTR [rdi+128+rdx], cl    # e->_s[slot] = low byte of (b ^ slot); _s is at offset 128
ret
.L2:
tzcnt esi, edx                    # esi = ctz(m); 32-bit write also zeroes the upper half of rsi
add eax, esi                      # b*3 + slot
xor ecx, esi                      # b ^ slot
xor eax, -1640531527              # ^ 0x9e3779b9
mov DWORD PTR [rdi+rsi*4], eax    # e->_v[slot] = t
mov BYTE PTR [rdi+128+rsi], cl    # e->_s[slot] = low byte of (b ^ slot)
ret
# Compiler output for foo_v2 (Intel syntax).
# Registers as used below: rdi = e, sil = found, edx = m, ecx = b.
# Branchless: both slot candidates are computed, then one is selected with cmov.
foo_v2(Entry*, bool, unsigned int, int):
mov eax, ecx
tzcnt edx, edx                    # edx = ctz(m), computed unconditionally
and eax, 31                       # eax = b & 31
test sil, sil
cmovne edx, eax                   # found ? (b & 31) : ctz(m)
lea eax, [rcx+rcx*2]              # eax = b*3 (lea leaves flags untouched)
add eax, edx                      # b*3 + slot
movsx rsi, edx                    # sign-extend the int slot to 64 bits for addressing
xor ecx, edx                      # b ^ slot
xor eax, -1640531527              # ^ 0x9e3779b9 (same bit pattern, printed as signed)
mov DWORD PTR [rdi+rsi*4], eax    # e->_v[slot] = t
mov BYTE PTR [rdi+128+rsi], cl    # e->_s[slot] = low byte of (b ^ slot); _s is at offset 128
ret
Godbolt: https://godbolt.org/z/8zv1s47hr
Note: Clang does not figure it out either.