https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114252

            Bug ID: 114252
           Summary: Introducing bswapsi reduces code performance
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gjl at gcc dot gnu.org
  Target Milestone: ---

Created attachment 57628
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57628&action=edit
GNU-C test case

/* Fixed-width integer types taken from the compiler's predefined macros,
   so the test case needs no #include. */
typedef __UINT8_TYPE__ uint8_t;
typedef __UINT32_TYPE__ uint32_t;

/* GNU C vector type: uint8_t elements, vector_size(4) = 4 bytes in total. */
typedef uint8_t __attribute__((vector_size(4))) v4u8_t;

/* Reproducer: build a 4-element byte vector from buf[0..3] with each
   adjacent pair of bytes swapped (buf[1], buf[0], buf[3], buf[2]) and
   return that vector cast to uint32_t.  */
uint32_t func1 (const uint8_t *buf) {
    /* Element order is the whole point of the test: 1, 0, 3, 2. */
    v4u8_t v4 = { buf[1], buf[0], buf[3], buf[2] };

    /* Same-size vector-to-integer cast (GNU C vector extension). */
    return (uint32_t) v4;
}

Compile the code with

$ avr-gcc code.c -S -Os -dp

With v13, the result is:


; avr-gcc v13 output for func1 (compiler listing; comment lines without an
; insn number were added for the reader).
func1:
        ; Copy the pointer argument from r25:r24 into pointer register Z (r31:r30).
        mov r30,r24      ;  37  [c=4 l=1]  movqi_insn/0
        mov r31,r25      ;  38  [c=4 l=1]  movqi_insn/0
        ; Four single-byte loads; each byte goes straight into its final
        ; register (r22 <- Z+1, r23 <- Z+0, r24 <- Z+3, r25 <- Z+2).
        ldd r22,Z+1      ;  39  [c=4 l=1]  movqi_insn/3
        ld r23,Z                 ;  40  [c=4 l=1]  movqi_insn/3
        ldd r24,Z+3      ;  41  [c=4 l=1]  movqi_insn/3
        ldd r25,Z+2      ;  42  [c=4 l=1]  movqi_insn/3
/* epilogue start */
        ret              ;  45  [c=0 l=1]  return

This is good code: insns 37 and 38 move the address to pointer register Z,
followed by 4 loads, one for each byte.

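When compiled with v14, however, the bytes are loaded in memory order, passed
through a call to __bswapsi2, and then rearranged again with six extra moves: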

; avr-gcc v14 output for func1 (compiler listing; comment lines without an
; insn number were added for the reader).
func1:
        ; Copy the pointer argument from r25:r24 into pointer register Z (r31:r30).
        mov r30,r24      ;  23  [c=4 l=2]  *movhi/0
        mov r31,r25
        ; One 4-byte load (*movsi) in memory order: r22..r25 <- Z+0..Z+3.
        ld r22,Z         ;  24  [c=16 l=4]  *movsi/2
        ldd r23,Z+1
        ldd r24,Z+2
        ldd r25,Z+3
        ; Library call introduced by the bswapsi pattern.
        rcall __bswapsi2         ;  25  [c=16 l=1]  *bswapsi2.libgcc
        ; Swap r23 <-> r25, using r31 as scratch.
        mov r31,r23      ;  32  [c=4 l=1]  movqi_insn/0
        mov r23,r25      ;  33  [c=4 l=1]  movqi_insn/0
        mov r25,r31      ;  34  [c=4 l=1]  movqi_insn/0
        ; Swap r22 <-> r24, using r31 as scratch.
        mov r31,r22      ;  35  [c=4 l=1]  movqi_insn/0
        mov r22,r24      ;  36  [c=4 l=1]  movqi_insn/0
        mov r24,r31      ;  37  [c=4 l=1]  movqi_insn/0
/* epilogue start */
        ret              ;  40  [c=0 l=1]  return


Target: avr
Configured with: ../../source/gcc-master/configure --target=avr --disable-nls
--with-dwarf2 --with-gnu-as --with-gnu-ld --disable-shared
--enable-languages=c,c++
Thread model: single
Supported LTO compression algorithms: zlib
gcc version 14.0.1 20240303 (experimental) (GCC)

Reply via email to