https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124829
Bug ID: 124829
Summary: Missed optimization opportunity with bitfields
Product: gcc
Version: 16.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: federico at kircheis dot it
Target Milestone: ---
Consider the following C++ snippet:
~~~~
#include <bit>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <span>
#include <utility>
// Two-byte record described with bit-fields, with the members declared in the
// order used when the native byte order is little-endian (this is the type the
// `data_t` alias selects in that case).
// NOTE(review): how bit-fields are packed is implementation-defined; the bit
// positions noted below are the ones to_pod2 decodes -- confirm for the ABI.
struct data_t_little_endian {
// byte 0
uint8_t v1 : 2; // intended: bits 0-1
uint8_t v2 : 1; // intended: bit 2
uint8_t v3 : 1; // intended: bit 3
uint8_t v4 : 4; // intended: bits 4-7
// byte 1
uint8_t v5 : 2; // intended: bits 0-1
uint8_t padding : 6; // remaining six bits of byte 1
// Member-wise equality; `padding` is a named member, so it is compared too.
bool operator==(const data_t_little_endian&) const noexcept = default;
};
// Same two-byte record as data_t_little_endian, but with the members of each
// byte declared in the opposite order; this is the type the `data_t` alias
// selects when the native byte order is not little-endian.
// NOTE(review): how bit-fields are packed is implementation-defined; the
// reversed order presumably targets ABIs that allocate bit-fields starting at
// the most significant bit -- confirm for the target ABI.
struct data_t_big_endian {
// byte 0
uint8_t v4 : 4; // intended: bits 4-7
uint8_t v3 : 1; // intended: bit 3
uint8_t v2 : 1; // intended: bit 2
uint8_t v1 : 2; // intended: bits 0-1
// byte 1
uint8_t padding : 6; // remaining six bits of byte 1
uint8_t v5 : 2; // intended: bits 0-1
// Member-wise equality; `padding` is a named member, so it is compared too.
bool operator==(const data_t_big_endian&) const noexcept = default;
};
// Picks the bit-field declaration order that matches the native byte order:
// the little-endian struct on little-endian targets, the big-endian struct on
// every other target.
using data_t =
    std::conditional_t<std::endian::native == std::endian::little,
                       data_t_little_endian, data_t_big_endian>;
// Builds a data_t by copying the raw bytes of `s` over a zero-initialised
// object. The span has a fixed extent of sizeof(data_t), so the copy covers
// the whole object, including the bits that belong to `padding`.
data_t to_pod(std::span<const unsigned char, sizeof(data_t)> s) {
  data_t result{};
  std::memcpy(&result, s.data(), s.size());
  return result;
}
// Builds a data_t by decoding the two input bytes field by field with shifts
// and masks. `padding` is never assigned, so it keeps the zero it received
// from value-initialisation regardless of the upper six bits of s[1].
data_t to_pod2(std::span<const unsigned char, sizeof(data_t)> s) {
  data_t result{};

  const uint8_t first = s[0];
  result.v1 = first & 0x03;         // bits 0-1
  result.v2 = (first >> 2) & 0x01;  // bit 2
  result.v3 = (first >> 3) & 0x01;  // bit 3
  result.v4 = (first >> 4) & 0x0F;  // bits 4-7

  const uint8_t second = s[1];
  result.v5 = second & 0x03;        // bits 0-1

  return result;
}
void test() {
for (unsigned char c1 = 0; c1 != 254; ++c1) {
for (unsigned char c2 = 0; c2 != 254; ++c2) {
unsigned char buffer[] = {c1, c2};
auto v1 = to_pod(buffer);
auto v2 = to_pod(buffer);
if (v1 != v2)
{
std::puts("error");
}
}
}
}
~~~~
As far as I can see, the behavior is the same on both big- and little-endian
platforms, and to_pod and to_pod2 always produce the same result.
However, to_pod is "faster" than to_pod2; for example, on the powerpc platform
I get
~~~~
to_pod(std::span<unsigned char const, 2u>):
lwz 9,0(4)
lhz 9,0(9)
sth 9,0(3)
blr
to_pod2(std::span<unsigned char const, 2u>):
lwz 9,0(4)
lbz 10,0(9)
lbz 9,1(9)
rlwinm 10,10,8,16,23
rlwinm 9,9,0,30,31
or 9,9,10
sth 9,0(3)
blr
~~~~
and on x64
~~~~
"to_pod(std::span<unsigned char const, 2ul>)":
movzx eax, WORD PTR [rdi]
ret
"to_pod2(std::span<unsigned char const, 2ul>)":
movzx edx, BYTE PTR [rdi]
mov eax, edx
mov ecx, edx
shr al, 2
and ecx, 3
and eax, 1
sal eax, 2
or eax, ecx
mov ecx, edx
shr dl, 4
shr cl, 3
movzx edx, dl
and ecx, 1
sal edx, 4
sal ecx, 3
or eax, ecx
or eax, edx
movzx edx, BYTE PTR [rdi+1]
and edx, 3
sal edx, 8
or eax, edx
ret
~~~~
and arm64
~~~~
to_pod(std::span<unsigned char const, 2ul>):
ldrh w0, [x0]
ret
to_pod2(std::span<unsigned char const, 2ul>):
ldrb w1, [x0]
ldrb w0, [x0, 1]
ubfiz w0, w0, 8, 2
orr w0, w0, w1
ret
~~~~
On all platforms, test is optimized to a single return statement; thus the
optimizer seems to be able to see that to_pod and to_pod2 are equivalent, but
the code generated for the two functions is not the same.