https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114252
Bug ID: 114252
Summary: Introducing bswapsi reduces code performance
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: gjl at gcc dot gnu.org
Target Milestone: ---

Created attachment 57628
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57628&action=edit
GNU-C test case

typedef __UINT8_TYPE__ uint8_t;
typedef __UINT32_TYPE__ uint32_t;
typedef uint8_t __attribute__((vector_size(4))) v4u8_t;

uint32_t func1 (const uint8_t *buf)
{
    v4u8_t v4 = { buf[1], buf[0], buf[3], buf[2] };
    return (uint32_t) v4;
}

Compile the code with

$ avr-gcc code.c -S -Os -dp

With v13, the result is:

func1:
    mov r30,r24         ; 37 [c=4 l=1] movqi_insn/0
    mov r31,r25         ; 38 [c=4 l=1] movqi_insn/0
    ldd r22,Z+1         ; 39 [c=4 l=1] movqi_insn/3
    ld r23,Z            ; 40 [c=4 l=1] movqi_insn/3
    ldd r24,Z+3         ; 41 [c=4 l=1] movqi_insn/3
    ldd r25,Z+2         ; 42 [c=4 l=1] movqi_insn/3
/* epilogue start */
    ret                 ; 45 [c=0 l=1] return

This is good code: insns 37 and 38 move the address to pointer register Z, followed by 4 loads, one for each byte.

When compiled with v14, however:

func1:
    mov r30,r24         ; 23 [c=4 l=2] *movhi/0
    mov r31,r25
    ld r22,Z            ; 24 [c=16 l=4] *movsi/2
    ldd r23,Z+1
    ldd r24,Z+2
    ldd r25,Z+3
    rcall __bswapsi2    ; 25 [c=16 l=1] *bswapsi2.libgcc
    mov r31,r23         ; 32 [c=4 l=1] movqi_insn/0
    mov r23,r25         ; 33 [c=4 l=1] movqi_insn/0
    mov r25,r31         ; 34 [c=4 l=1] movqi_insn/0
    mov r31,r22         ; 35 [c=4 l=1] movqi_insn/0
    mov r22,r24         ; 36 [c=4 l=1] movqi_insn/0
    mov r24,r31         ; 37 [c=4 l=1] movqi_insn/0
/* epilogue start */
    ret                 ; 40 [c=0 l=1] return

Target: avr
Configured with: ../../source/gcc-master/configure --target=avr --disable-nls --with-dwarf2 --with-gnu-as --with-gnu-ld --disable-shared --enable-languages=c,c++
Thread model: single
Supported LTO compression algorithms: zlib
gcc version 14.0.1 20240303 (experimental) (GCC)