https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338
Bug ID: 61338 Summary: too many permutation in a vectorized "reverse loop" Product: gcc Version: 4.9.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: vincenzo.innocente at cern dot ch In this example gcc generates 4 permutations for foo (while none is required). On the positive side, the code for bar (which is a more realistic use case) seems optimal. float x[1024]; float y[1024]; float z[1024]; void foo() { for (int i=0; i<512; ++i) x[1023-i] += y[1023-i]*z[512-i]; } void bar() { for (int i=0; i<512; ++i) x[1023-i] += y[i]*z[i+512]; } c++ -Ofast -march=haswell -S revloop.cc; cat revloop.s __Z3foov: LFB0: vmovdqa LC0(%rip), %ymm2 xorl %eax, %eax leaq 4064+_x(%rip), %rdx leaq 4064+_y(%rip), %rsi leaq 2020+_z(%rip), %rcx .align 4,0x90 L2: vpermd (%rdx,%rax), %ymm2, %ymm0 vpermd (%rcx,%rax), %ymm2, %ymm1 vpermd (%rsi,%rax), %ymm2, %ymm3 vfmadd231ps %ymm1, %ymm3, %ymm0 vpermd %ymm0, %ymm2, %ymm0 vmovaps %ymm0, (%rdx,%rax) subq $32, %rax cmpq $-2048, %rax jne L2 vzeroupper ret LFE0: .section __TEXT,__text_cold,regular,pure_instructions LCOLDE1: .text LHOTE1: .section __TEXT,__text_cold,regular,pure_instructions LCOLDB2: .text LHOTB2: .align 4,0x90 .globl __Z3barv __Z3barv: LFB1: vmovdqa LC0(%rip), %ymm1 leaq 2048+_z(%rip), %rdx leaq _y(%rip), %rcx leaq 4064+_x(%rip), %rax leaq 4096+_z(%rip), %rsi .align 4,0x90 L6: vmovaps (%rdx), %ymm2 addq $32, %rdx vpermd (%rax), %ymm1, %ymm0 addq $32, %rcx vfmadd231ps -32(%rcx), %ymm2, %ymm0 subq $32, %rax vpermd %ymm0, %ymm1, %ymm0 vmovaps %ymm0, 32(%rax) cmpq %rsi, %rdx jne L6 vzeroupper ret LFE1: