https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107837
Bug ID: 107837
Summary: Missed optimization: Using memcpy to load a struct
unnecessarily uses stack space
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: chfast at gmail dot com
Target Milestone: ---
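I have a simple struct wrapping an array of four 64-bit words (uint64_t[4],
spelled unsigned long[4] in the example below). When memcpy() is used to load
it from a byte buffer and some additional operations are then performed on the
loaded value, an unnecessary temporary object is created on the stack.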
// 256-bit value stored as four unsigned long words. On the x86-64 target whose
// assembly is quoted in this report each word is 64 bits wide, so the struct
// occupies 32 bytes.
struct uint256
{
unsigned long v[4];
};
// Reproducer for the missed optimization.
//
// Copies sizeof(uint256) bytes from src into a local uint256, byte-swaps each
// word and writes the words to *o in reverse order, i.e. it stores the
// byte-reversed value of the whole source buffer.
//
//   o   - destination; receives the converted value.
//   src - source bytes; at least sizeof(uint256) bytes are read from it.
void load_bad(uint256* o, const char* src) noexcept
{
// Local copy filled via memcpy: this is the temporary that ends up being
// materialized on the stack in the generated code.
uint256 x;
__builtin_memcpy(&x, src, sizeof(x));
// Build the result with the word order reversed (x.v[3] -> y.v[0], ...) and
// the bytes inside every word swapped.
uint256 y;
y.v[0] = __builtin_bswap64(x.v[3]);
y.v[1] = __builtin_bswap64(x.v[2]);
y.v[2] = __builtin_bswap64(x.v[1]);
y.v[3] = __builtin_bswap64(x.v[0]);
*o = y;
}
# Assembly posted in the report for load_bad (rdi = o, rsi = src).
# The two 16-byte halves of src are loaded into xmm registers, spilled to stack
# slots below rsp, and then reloaded as four 8-byte scalars before the bswaps.
# That store/reload round trip is the unnecessary stack use being reported.
load_bad(uint256*, char const*):
movdqu xmm0, XMMWORD PTR [rsi]          # load src bytes 0..15
movdqu xmm1, XMMWORD PTR [rsi+16]       # load src bytes 16..31
movaps XMMWORD PTR [rsp-40], xmm0       # spill x.v[0], x.v[1] to the stack
mov rdx, QWORD PTR [rsp-32]             # reload x.v[1]
mov rax, QWORD PTR [rsp-40]             # reload x.v[0]
movaps XMMWORD PTR [rsp-24], xmm1       # spill x.v[2], x.v[3] to the stack
mov rsi, QWORD PTR [rsp-16]             # reload x.v[3]
mov rcx, QWORD PTR [rsp-24]             # reload x.v[2]
bswap rdx
bswap rax
mov QWORD PTR [rdi+16], rdx             # o->v[2] = bswap(x.v[1])
bswap rsi
bswap rcx
mov QWORD PTR [rdi], rsi                # o->v[0] = bswap(x.v[3])
mov QWORD PTR [rdi+8], rcx              # o->v[1] = bswap(x.v[2])
mov QWORD PTR [rdi+24], rax             # o->v[3] = bswap(x.v[0])
ret
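A workaround is to read the source through reinterpret_cast<const uint256*>(src)
instead of memcpy(), which avoids the stack temporary. Note that, unlike
memcpy(), the cast is only valid when src is suitably aligned and actually
refers to a uint256 object, so the memcpy() form is the one that should be
optimized.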
https://godbolt.org/z/WevYch8nv