https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338

            Bug ID: 61338
           Summary: too many permutation in a vectorized "reverse loop"
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vincenzo.innocente at cern dot ch

In this example, gcc generates 4 permutations for foo (while none are required).
On the positive side, the code for bar (which is a more realistic use case)
seems optimal.

// Global arrays of 1024 floats each, used by foo() and bar() below.
// x is the in-place accumulator (read and written); y and z are only read.
float x[1024];
float y[1024];
float z[1024];

// Reproducer: every array is indexed downward as i increases.
// Over the 512 iterations x and y are read at indices 1023..512 and z at
// indices 512..1; the product y*z is accumulated into x in place.
// This is the loop for which the report counts four permutations in the
// generated code.
void foo() {
  for (int i=0; i<512; ++i)
    x[1023-i] += y[1023-i]*z[512-i];
}


// Comparison case: only x is indexed downward (1023..512); y is read upward
// at indices 0..511 and z upward at indices 512..1023. The product y*z is
// accumulated into x in place.
void bar() {
  for (int i=0; i<512; ++i)
    x[1023-i] += y[i]*z[i+512];
}

c++ -Ofast -march=haswell -S revloop.cc; cat revloop.s

# Compiler output for foo() (AT&T operand order: sources first, destination last).
# Each loop iteration processes one 32-byte ymm vector, i.e. 8 floats.
# %rax is a byte offset shared by all three arrays: it starts at 0 and is
# decremented by 32 per iteration.
__Z3foov:
LFB0:
    # %ymm2 = index vector used by every vpermd below. LC0 is not shown in
    # this excerpt; presumably the lane-reversal pattern -- TODO confirm.
    vmovdqa    LC0(%rip), %ymm2
    # %rax = 0: running byte offset.
    xorl    %eax, %eax
    # %rdx = _x + 4064 bytes = &x[1016], the vector holding x[1016..1023].
    leaq    4064+_x(%rip), %rdx
    # %rsi = _y + 4064 bytes = &y[1016].
    leaq    4064+_y(%rip), %rsi
    # %rcx = _z + 2020 bytes = &z[505], the vector holding z[505..512].
    leaq    2020+_z(%rip), %rcx
    .align 4,0x90
L2:
    # Permutation 1: load 8 floats of x and permute them by %ymm2 -> %ymm0.
    vpermd    (%rdx,%rax), %ymm2, %ymm0
    # Permutation 2: load 8 floats of z and permute them by %ymm2 -> %ymm1.
    vpermd    (%rcx,%rax), %ymm2, %ymm1
    # Permutation 3: load 8 floats of y and permute them by %ymm2 -> %ymm3.
    vpermd    (%rsi,%rax), %ymm2, %ymm3
    # %ymm0 = %ymm3 * %ymm1 + %ymm0, i.e. x += y * z on the permuted lanes.
    vfmadd231ps    %ymm1, %ymm3, %ymm0
    # Permutation 4: permute the result by %ymm2 again before storing it.
    vpermd    %ymm0, %ymm2, %ymm0
    # Store the 8 results to the same x location that was loaded above.
    vmovaps    %ymm0, (%rdx,%rax)
    # Step the shared offset down by one vector.
    subq    $32, %rax
    # Leave the loop once the offset reaches -2048 (64 iterations of 32 bytes).
    cmpq    $-2048, %rax
    jne    L2
    vzeroupper
    ret
LFE0:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE1:
    .text
LHOTE1:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB2:
    .text
LHOTB2:
    .align 4,0x90
    .globl __Z3barv
# Compiler output for bar() (AT&T operand order: sources first, destination last).
# Each loop iteration processes one 32-byte ymm vector, i.e. 8 floats.
# Three separate pointers are used: %rdx (z) and %rcx (y) advance by 32 bytes,
# %rax (x) retreats by 32 bytes.
__Z3barv:
LFB1:
    # %ymm1 = index vector used by both vpermd below. LC0 is not shown in
    # this excerpt; presumably the lane-reversal pattern -- TODO confirm.
    vmovdqa    LC0(%rip), %ymm1
    # %rdx = _z + 2048 bytes = &z[512].
    leaq    2048+_z(%rip), %rdx
    # %rcx = &y[0].
    leaq    _y(%rip), %rcx
    # %rax = _x + 4064 bytes = &x[1016], the vector holding x[1016..1023].
    leaq    4064+_x(%rip), %rax
    # %rsi = _z + 4096 bytes, one past z[1023]; loop bound compared with %rdx.
    leaq    4096+_z(%rip), %rsi
    .align 4,0x90
L6:
    # Load 8 floats of z as-is (no permutation).
    vmovaps    (%rdx), %ymm2
    addq    $32, %rdx
    # Load 8 floats of x and permute them by %ymm1 -> %ymm0.
    vpermd    (%rax), %ymm1, %ymm0
    addq    $32, %rcx
    # %ymm0 = %ymm2 * y[...] + %ymm0; the -32 displacement undoes the addq
    # above so the y vector for this iteration is read.
    vfmadd231ps    -32(%rcx), %ymm2, %ymm0
    subq    $32, %rax
    # Permute the result by %ymm1 again before storing it.
    vpermd    %ymm0, %ymm1, %ymm0
    # Store to the x location loaded above; the +32 displacement undoes the
    # subq that already moved %rax to the next vector.
    vmovaps    %ymm0, 32(%rax)
    # Leave the loop once the z pointer reaches the end of z (64 iterations).
    cmpq    %rsi, %rdx
    jne    L6
    vzeroupper
    ret
LFE1:

Reply via email to