https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502

            Bug ID: 111502
           Summary: Suboptimal unaligned 2/4-byte memcpy on strict-align
                    targets
           Product: gcc
           Version: 13.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: lasse.collin at tukaani dot org
  Target Milestone: ---

I was playing with RISC-V GCC 12.2.0 from Arch Linux. I noticed
inefficient-looking assembly output in code that uses memcpy to access 32-bit
unaligned integers. I tried Godbolt with 16/32-bit integers and it seems that
the same weirdness happens with RV32 & RV64 with GCC 13.2.0 and trunk, and
also on a few other targets. (Clang's output looks OK.)

For a little endian target:

#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

riscv64-linux-gnu-gcc -march=rv64gc -O2 -mtune=size

bytes16:
        lhu     a0,0(a0)
        ret

copy16:
        lhu     a0,0(a0)
        ret

That looks good because -mno-strict-align is the default.

After omitting -mtune=size, unaligned access isn't used (the output is the same
as with -mstrict-align):

riscv64-linux-gnu-gcc -march=rv64gc -O2

bytes16:
        lbu     a5,1(a0)
        lbu     a0,0(a0)
        slli    a5,a5,8
        or      a0,a5,a0
        ret

copy16:
        lbu     a4,0(a0)
        lbu     a5,1(a0)
        addi    sp,sp,-16
        sb      a4,14(sp)
        sb      a5,15(sp)
        lhu     a0,14(sp)
        addi    sp,sp,16
        jr      ra

bytes16 looks good but copy16 is weird: the bytes are copied to an aligned
location on the stack and then loaded back.

On Godbolt it happens with GCC 13.2.0 on RV32, RV64, ARM64 (but only if using
-mstrict-align), MIPS64EL, and SPARC & SPARC64 (comparison needs big endian
bytes16). For ARM64 and MIPS64EL the oldest GCC on Godbolt is GCC 5.4 and the
same thing happens with that too.

32-bit reads with -O2 behave similarly. With -Os a call to memcpy is emitted
for copy32 but not for bytes32.

#include <stdint.h>
#include <string.h>

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

riscv64-linux-gnu-gcc -march=rv64gc -O2

bytes32:
        lbu     a4,1(a0)
        lbu     a3,0(a0)
        lbu     a5,2(a0)
        lbu     a0,3(a0)
        slli    a4,a4,8
        or      a4,a4,a3
        slli    a5,a5,16
        or      a5,a5,a4
        slli    a0,a0,24
        or      a0,a0,a5
        sext.w  a0,a0
        ret

copy32:
        lbu     a2,0(a0)
        lbu     a3,1(a0)
        lbu     a4,2(a0)
        lbu     a5,3(a0)
        addi    sp,sp,-16
        sb      a2,12(sp)
        sb      a3,13(sp)
        sb      a4,14(sp)
        sb      a5,15(sp)
        lw      a0,12(sp)
        addi    sp,sp,16
        jr      ra

riscv64-linux-gnu-gcc -march=rv64gc -Os

bytes32:
        lbu     a4,1(a0)
        lbu     a5,0(a0)
        slli    a4,a4,8
        or      a4,a4,a5
        lbu     a5,2(a0)
        lbu     a0,3(a0)
        slli    a5,a5,16
        or      a5,a5,a4
        slli    a0,a0,24
        or      a0,a0,a5
        sext.w  a0,a0
        ret

copy32:
        addi    sp,sp,-32
        mv      a1,a0
        li      a2,4
        addi    a0,sp,12
        sd      ra,24(sp)
        call    memcpy@plt
        ld      ra,24(sp)
        lw      a0,12(sp)
        addi    sp,sp,32
        jr      ra

I probably cannot test any proposed fixes but I hope this report is still
useful. Thanks!

Reply via email to