http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56511



             Bug #: 56511

           Summary: memcpy misses chance to use AVX instructions

    Classification: Unclassified

           Product: gcc

           Version: 4.7.2

            Status: UNCONFIRMED

          Severity: normal

          Priority: P3

         Component: rtl-optimization

        AssignedTo: unassig...@gcc.gnu.org

        ReportedBy: jyass...@gcc.gnu.org





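When copying a compile-time-constant size between sufficiently aligned objects,

memcpy should be able to use vector (AVX) instructions, as the plain vector

assignment in assign_vec below already does.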



$ cat test.c

#include <string.h>



/* 32-byte vector of float, declared with the GCC vector_size extension. */
typedef float vec __attribute__((vector_size(32)));

/* Test aggregate providing two destinations of identical size and alignment:
 * a real vector member and a plain char buffer. */
typedef struct S {

  /* Destination with vector type. */
  vec v;

  /* char buffer of sizeof(vec) bytes, explicitly aligned like vec. */
  char __attribute__((aligned(__alignof__(vec)))) c[sizeof(vec)];

} S;

/* Baseline: copy *v into s->v by plain assignment. In the listing below this
 * compiles to a vmovaps load/store pair through %ymm0. */
void assign_vec(S* s, const vec* v) { s->v = *v; }

/* Same copy as assign_vec, but expressed as memcpy of sizeof(vec) bytes into
 * the vector member. In the listing below this compiles to four 8-byte movq
 * load/store pairs instead of a vector move. */
void memcpy_vec(S* s, const vec* v) { memcpy(&s->v, v, sizeof(vec)); }

/* memcpy of sizeof(vec) bytes into the vec-aligned char buffer s->c. In the
 * listing below this also compiles to four movq pairs, storing at offsets
 * 32..56 from s. */
void memcpy_char(S* s, const vec* v) { memcpy(s->c, v, sizeof(vec)); }



$ gcc -mavx -S test.c -O2  -Wall -o - 

        .file   "test.c"

        .text

        .p2align 4,,15

        .globl  assign_vec

        .type   assign_vec, @function

assign_vec:

.LFB12:

        .cfi_startproc

        vmovaps (%rsi), %ymm0

        vmovaps %ymm0, (%rdi)

        vzeroupper

        ret

        .cfi_endproc

.LFE12:

        .size   assign_vec, .-assign_vec

        .p2align 4,,15

        .globl  memcpy_vec

        .type   memcpy_vec, @function

memcpy_vec:

.LFB13:

        .cfi_startproc

        movq    (%rsi), %rax

        movq    %rax, (%rdi)

        movq    8(%rsi), %rax

        movq    %rax, 8(%rdi)

        movq    16(%rsi), %rax

        movq    %rax, 16(%rdi)

        movq    24(%rsi), %rax

        movq    %rax, 24(%rdi)

        ret

        .cfi_endproc

.LFE13:

        .size   memcpy_vec, .-memcpy_vec

        .p2align 4,,15

        .globl  memcpy_char

        .type   memcpy_char, @function

memcpy_char:

.LFB14:

        .cfi_startproc

        movq    (%rsi), %rdx

        movq    %rdx, 32(%rdi)

        movq    8(%rsi), %rdx

        movq    %rdx, 40(%rdi)

        movq    16(%rsi), %rdx

        movq    %rdx, 48(%rdi)

        movq    24(%rsi), %rdx

        movq    %rdx, 56(%rdi)

        ret

        .cfi_endproc

.LFE14:

        .size   memcpy_char, .-memcpy_char





I don't have GCC 4.8 available to test with, but I believe it is also missing

this optimization.

Reply via email to