https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93737

            Bug ID: 93737
           Summary: inline memmove for insertion into small arrays
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: msebor at gcc dot gnu.org
  Target Milestone: ---

GCC emits what is likely more efficient code for inserting elements into the
middle of small arrays when parts of the array are copied through a temporary
buffer than it does for a direct call to memmove that achieves the same result
without the temporary buffer.

The example below shows the difference.  For both functions, Clang emits the
same (presumably optimally efficient) code that GCC emits for f0.

$ cat t.c && gcc -DN=32 -O2 -S -Wall -o/dev/stdout t.c
int a[N];

/* Insert x at index 1 of the global array a, moving the old a[1] .. a[N-2]
   up by one element (the old a[N-1] is overwritten).  The move is done
   through the local temporary b with two memcpy calls.  */
void f0 (int x)
{
  int b[N];
  /* Copy a[1] .. a[N-1] (N-1 elements) into the temporary.  */
  __builtin_memcpy (b, a + 1, sizeof a - sizeof *a);
  /* Copy the first N-2 elements of the temporary to a[2] .. a[N-1].  */
  __builtin_memcpy (a + 2, b, sizeof a - 2 * sizeof *a);
  a[1] = x;
}

/* Insert x at index 1 of the global array a, moving the old a[1] .. a[N-2]
   up by one element in place with a single memmove; the source and
   destination ranges overlap, and no temporary buffer is used.  */
void f2 (int x)
{
  /* Move N-2 elements from a[1] .. a[N-2] to a[2] .. a[N-1].  */
  __builtin_memmove (a + 2, a + 1, sizeof a - 2 * sizeof *a);
  a[1] = x;
}


        .file   "t.c"
        .text
        # f0 (x in %edi): the two memcpy calls from the C source are expanded
        # inline.  120 bytes at a+4 .. a+123 (a[1] .. a[30]) are loaded into
        # %xmm0-%xmm6 and %rax, then stored 4 bytes higher at a+8 .. a+127
        # (a[2] .. a[31]); x is stored to a+4 (a[1]).  Every load is issued
        # before any store that overlaps the bytes it reads.
        .p2align 4
        .globl  f0
        .type   f0, @function
f0:
.LFB0:
        .cfi_startproc
        subq    $16, %rsp               # stack adjustment; no instruction below addresses memory via %rsp
        .cfi_def_cfa_offset 24
        movdqu  a+20(%rip), %xmm5       # load a[5]  .. a[8]
        movdqu  a+36(%rip), %xmm4       # load a[9]  .. a[12]
        movdqu  a+52(%rip), %xmm3       # load a[13] .. a[16]
        movdqu  a+68(%rip), %xmm2       # load a[17] .. a[20]
        movdqu  a+84(%rip), %xmm1       # load a[21] .. a[24]
        movdqu  a+100(%rip), %xmm0      # load a[25] .. a[28]
        movups  %xmm5, a+24(%rip)       # store to a[6] .. a[9]; their old values are already in %xmm5/%xmm4
        movq    a+116(%rip), %rax       # load the last 8 bytes of the source: a[29], a[30]
        movdqu  a+4(%rip), %xmm6        # load a[1]  .. a[4]
        movups  %xmm4, a+40(%rip)       # store to a[10] .. a[13]
        movl    %edi, a+4(%rip)         # a[1] = x; the old a[1] was captured in %xmm6 above
        movq    %rax, a+120(%rip)       # store to a[30], a[31]
        movups  %xmm6, a+8(%rip)        # store to a[2]  .. a[5]
        movups  %xmm3, a+56(%rip)       # store to a[14] .. a[17]
        movups  %xmm2, a+72(%rip)       # store to a[18] .. a[21]
        movups  %xmm1, a+88(%rip)       # store to a[22] .. a[25]
        movups  %xmm0, a+104(%rip)      # store to a[26] .. a[29]
        addq    $16, %rsp               # undo the stack adjustment made at entry
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE0:
        .size   f0, .-f0
        # f2 (x in %edi): the memmove from the C source is emitted as an
        # out-of-line call to memmove (dest = a+8, src = a+4, 120 bytes),
        # followed by the store of x to a+4 (a[1]).
        .p2align 4
        .globl  f2
        .type   f2, @function
f2:
.LFB1:
        .cfi_startproc
        pushq   %rbx                    # save %rbx; it holds x across the call below
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movl    $120, %edx              # 3rd argument: byte count, 30 elements * 4 bytes
        movl    %edi, %ebx              # keep x in %ebx, since %edi is reused for the 1st argument
        movl    $a+4, %esi              # 2nd argument: source address a+4 (&a[1])
        movl    $a+8, %edi              # 1st argument: destination address a+8 (&a[2])
        call    memmove
        movl    %ebx, a+4(%rip)         # a[1] = x
        popq    %rbx                    # restore the caller's %rbx
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE1:
        .size   f2, .-f2
        # Global array a from the C source: 128 zero-initialized bytes in
        # .bss (int a[N] compiled with -DN=32), aligned to 32 bytes.
        .globl  a
        .bss
        .align 32
        .type   a, @object
        .size   a, 128
a:
        .zero   128
        .ident  "GCC: (GNU) 10.0.1 20200212 (experimental)"
        .section        .note.GNU-stack,"",@progbits

Reply via email to