https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80481
Bug ID: 80481
Summary: Unoptimal additional copy instructions
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: andrew.n.senkevich at gmail dot com
Target Milestone: ---

Created attachment 41242
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41242&action=edit
test-case to reproduce

Hi,

as was found in PR78116, the attached testcase (compiled with
g++ -Ofast -fopenmp -funroll-loops -march=knl) contains a series of moves (1)
that could apparently be avoided by making the destination register of the
preceding vpermps (2) the same as the move's destination register and by
reordering the vpermps and vmaxps instructions according to the data flow
(e.g. (3) should come after (4)).

.L26:
        vmaxps  (%r10,%rax), %zmm15, %zmm1
        vpermps (%rcx), %zmm9, %zmm2                 (2)
        vmovaps %zmm2, %zmm14                        (1)
        vpermps -64(%rcx), %zmm9, %zmm2              (2) (3)
        vfnmadd132ps    (%r14,%rax), %zmm12, %zmm14
        leal    4(%rsi), %esi
        vmaxps  %zmm1, %zmm14, %zmm13                (4)
        vmovaps %zmm2, %zmm14                        (1)
        vmaxps  64(%r10,%rax), %zmm15, %zmm1
        vfnmadd132ps    64(%r14,%rax), %zmm12, %zmm14
        vpermps -128(%rcx), %zmm9, %zmm2             (2)
        cmpl    %esi, %r11d
        vmovups %zmm13, (%r9,%rax)
        leaq    -256(%rcx), %rcx
        vmaxps  %zmm1, %zmm14, %zmm13
        vmovaps %zmm2, %zmm14                        (1)
        vmaxps  128(%r10,%rax), %zmm15, %zmm1
        vfnmadd132ps    128(%r14,%rax), %zmm12, %zmm14
        vpermps 64(%rcx), %zmm9, %zmm2               (2)
        vmovups %zmm13, 64(%r9,%rax)
        vmaxps  %zmm1, %zmm14, %zmm13
        vmovaps %zmm2, %zmm14                        (1)
        vmaxps  192(%r10,%rax), %zmm15, %zmm1
        vfnmadd132ps    192(%r14,%rax), %zmm12, %zmm14
        vmovups %zmm13, 128(%r9,%rax)
        vmaxps  %zmm1, %zmm14, %zmm13
        vmovups %zmm13, 192(%r9,%rax)
        leaq    256(%rax), %rax
        ja      .L26

It is easier to see without -funroll-loops:

.L26:
        vpermps (%rcx), %zmm10, %zmm1                (2)
        leal    1(%rsi), %esi
        vmovaps %zmm1, %zmm2                         (1)
        vmaxps  (%r15,%rdx), %zmm3, %zmm1
        vfnmadd132ps    (%r12,%rdx), %zmm7, %zmm2
        cmpl    %esi, %r8d
        leaq    -64(%rcx), %rcx
        vmaxps  %zmm1, %zmm2, %zmm1
        vmovups %zmm1, (%rdi,%rdx)
        leaq    64(%rdx), %rdx
        ja      .L26
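
For illustration only, a hand-edited sketch (not actual compiler output) of the
non-unrolled loop with the suggested change applied: vpermps is assumed to write
directly into %zmm2, the register consumed by vfnmadd132ps, so the vmovaps copy
(1) disappears. No reordering is needed in this simpler loop because the vpermps
already precedes the dependent vmaxps.

.L26:
        vpermps (%rcx), %zmm10, %zmm2                # hypothetical: permute written straight to %zmm2, copy (1) removed
        leal    1(%rsi), %esi
        vmaxps  (%r15,%rdx), %zmm3, %zmm1            # %zmm1 is no longer needed for the permute result
        vfnmadd132ps    (%r12,%rdx), %zmm7, %zmm2    # consumes the permuted value directly from %zmm2
        cmpl    %esi, %r8d
        leaq    -64(%rcx), %rcx
        vmaxps  %zmm1, %zmm2, %zmm1
        vmovups %zmm1, (%rdi,%rdx)
        leaq    64(%rdx), %rdx
        ja      .L26

In the unrolled loop the same rewrite would additionally require moving the
vpermps marked (3) below the vmaxps marked (4), since %zmm14 is still live
(used by vfnmadd132ps and vmaxps) at the point where (3) currently sits.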