https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124829

            Bug ID: 124829
           Summary: Missed optimization opportunity with bitfields
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: federico at kircheis dot it
  Target Milestone: ---

Consider the following C++ snippet:


~~~~
#include <bit>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <span>
#include <type_traits>
#include <utility>

// 16 bits of packed bit-fields, declared in the order used when
// std::endian::native == std::endian::little (see the data_t alias).
// NOTE(review): bit-field allocation order is implementation-defined; this
// declaration order presumably assumes the first-declared field lands in the
// least significant bits of each byte — confirm for the target ABI.
struct data_t_little_endian {
    // b0: 2 + 1 + 1 + 4 = 8 bits
    uint8_t v1 : 2;
    uint8_t v2 : 1;
    uint8_t v3 : 1;
    uint8_t v4 : 4;
    // b1: 2 + 6 = 8 bits
    uint8_t v5 : 2;
    uint8_t padding : 6;  // named member, so the defaulted operator== compares it as well

    // Member-wise equality over every bit-field above, `padding` included.
    bool operator==(const data_t_little_endian&) const noexcept = default;
};

// Same fields as data_t_little_endian with the declaration order reversed
// within each byte; chosen by the data_t alias when std::endian::native is
// not std::endian::little.
// NOTE(review): bit-field allocation order is implementation-defined; this
// declaration order presumably assumes the first-declared field lands in the
// most significant bits of each byte — confirm for the target ABI.
struct data_t_big_endian {
    // b0: 4 + 1 + 1 + 2 = 8 bits
    uint8_t v4 : 4;
    uint8_t v3 : 1;
    uint8_t v2 : 1;
    uint8_t v1 : 2;
    // b1: 6 + 2 = 8 bits
    uint8_t padding : 6;  // named member, so the defaulted operator== compares it as well
    uint8_t v5 : 2;
    // Member-wise equality over every bit-field above, `padding` included.
    bool operator==(const data_t_big_endian&) const noexcept = default;
};

// Record type matching the host byte order: data_t_little_endian when
// std::endian::native is std::endian::little, data_t_big_endian otherwise.
using data_t = std::conditional_t<std::endian::native == std::endian::little,
                                  data_t_little_endian, data_t_big_endian>;


// Produces a data_t by copying the input bytes verbatim over a
// value-initialized object; every input byte is carried into the result
// unchanged.
data_t to_pod(std::span<const unsigned char, sizeof(data_t)> s) {
    data_t result{};
    std::memcpy(&result, s.data(), s.size_bytes());
    return result;
}

// Builds a data_t field by field from s[0] and s[1] using explicit shifts and
// masks. `padding` is never assigned, so it keeps the zero it received from
// value-initialization regardless of the upper six bits of s[1].
data_t to_pod2(std::span<const unsigned char, sizeof(data_t)> s) {
    const uint8_t first = s[0];
    const uint8_t second = s[1];

    data_t result{};

    // First byte: v1, v2, v3, v4 counted upwards from the least significant bit.
    result.v1 = first & 0x03;         // bits 0-1
    result.v2 = (first >> 2) & 0x01;  // bit  2
    result.v3 = (first >> 3) & 0x01;  // bit  3
    result.v4 = (first >> 4) & 0x0F;  // bits 4-7

    // Second byte: only the two low bits are consumed.
    result.v5 = second & 0x03;  // bits 0-1

    return result;
}

void test() {
    for (unsigned char c1 = 0; c1 != 254; ++c1) {
        for (unsigned char c2 = 0; c2 != 254; ++c2) {
            unsigned char buffer[] = {c1, c2};
            auto v1 = to_pod(buffer);
            auto v2 = to_pod(buffer);
            if (v1 != v2) 
            {
                std::puts("error");
            }
        }
    }
}
~~~~

As far as I can see, the behavior is the same on both big- and little-endian
platforms, and to_pod and to_pod2 always produce the same result.

However, to_pod is "faster" than to_pod2; for example, on the PowerPC platform
I get:

~~~~
to_pod(std::span<unsigned char const, 2u>):
        lwz 9,0(4)
        lhz 9,0(9)
        sth 9,0(3)
        blr
to_pod2(std::span<unsigned char const, 2u>):
        lwz 9,0(4)
        lbz 10,0(9)
        lbz 9,1(9)
        rlwinm 10,10,8,16,23
        rlwinm 9,9,0,30,31
        or 9,9,10
        sth 9,0(3)
        blr
~~~~

and on x64

~~~~
"to_pod(std::span<unsigned char const, 2ul>)":
        movzx   eax, WORD PTR [rdi]
        ret
"to_pod2(std::span<unsigned char const, 2ul>)":
        movzx   edx, BYTE PTR [rdi]
        mov     eax, edx
        mov     ecx, edx
        shr     al, 2
        and     ecx, 3
        and     eax, 1
        sal     eax, 2
        or      eax, ecx
        mov     ecx, edx
        shr     dl, 4
        shr     cl, 3
        movzx   edx, dl
        and     ecx, 1
        sal     edx, 4
        sal     ecx, 3
        or      eax, ecx
        or      eax, edx
        movzx   edx, BYTE PTR [rdi+1]
        and     edx, 3
        sal     edx, 8
        or      eax, edx
        ret
~~~~

and arm64

~~~~
to_pod(std::span<unsigned char const, 2ul>):
        ldrh    w0, [x0]
        ret
to_pod2(std::span<unsigned char const, 2ul>):
        ldrb    w1, [x0]
        ldrb    w0, [x0, 1]
        ubfiz   w0, w0, 8, 2
        orr     w0, w0, w1
        ret
~~~~


On all platforms, test is optimized to a single return statement; thus the
optimizer seems to be able to see that to_pod and to_pod2 are equivalent, but
the code generated for the two functions is not the same.

Reply via email to