https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80481

            Bug ID: 80481
           Summary: Unoptimal additional copy instructions
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: andrew.n.senkevich at gmail dot com
  Target Milestone: ---

Created attachment 41242
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41242&action=edit
test-case to reproduce

Hi,

as was found in pr78116, the attached testcase (compiled with g++ -Ofast -fopenmp
-funroll-loops -march=knl) has a series of moves (1) which it seems could be
avoided if the target register of the preceding vpermps (2) were set equal to the
move's target register and the order of the vpermps and vmaxps instructions were
rearranged according to the data flow (e.g. (3) should come after (4)).

.L26:
        vmaxps  (%r10,%rax), %zmm15, %zmm1
        vpermps (%rcx), %zmm9, %zmm2                       (2)
        vmovaps %zmm2, %zmm14                              (1)
        vpermps -64(%rcx), %zmm9, %zmm2                    (2) (3)
        vfnmadd132ps    (%r14,%rax), %zmm12, %zmm14            
        leal    4(%rsi), %esi
        vmaxps  %zmm1, %zmm14, %zmm13                          (4) 
        vmovaps %zmm2, %zmm14                              (1)
        vmaxps  64(%r10,%rax), %zmm15, %zmm1
        vfnmadd132ps    64(%r14,%rax), %zmm12, %zmm14
        vpermps -128(%rcx), %zmm9, %zmm2                   (2)
        cmpl    %esi, %r11d
        vmovups %zmm13, (%r9,%rax)
        leaq    -256(%rcx), %rcx
        vmaxps  %zmm1, %zmm14, %zmm13
        vmovaps %zmm2, %zmm14                              (1)
        vmaxps  128(%r10,%rax), %zmm15, %zmm1
        vfnmadd132ps    128(%r14,%rax), %zmm12, %zmm14
        vpermps 64(%rcx), %zmm9, %zmm2                     (2)
        vmovups %zmm13, 64(%r9,%rax)
        vmaxps  %zmm1, %zmm14, %zmm13
        vmovaps %zmm2, %zmm14                              (1)
        vmaxps  192(%r10,%rax), %zmm15, %zmm1
        vfnmadd132ps    192(%r14,%rax), %zmm12, %zmm14
        vmovups %zmm13, 128(%r9,%rax)
        vmaxps  %zmm1, %zmm14, %zmm13
        vmovups %zmm13, 192(%r9,%rax)
        leaq    256(%rax), %rax
        ja      .L26

It is better visible without -funroll-loops:

.L26:
        vpermps (%rcx), %zmm10, %zmm1                      (2)
        leal    1(%rsi), %esi
        vmovaps %zmm1, %zmm2                               (1)
        vmaxps  (%r15,%rdx), %zmm3, %zmm1
        vfnmadd132ps    (%r12,%rdx), %zmm7, %zmm2
        cmpl    %esi, %r8d
        leaq    -64(%rcx), %rcx
        vmaxps  %zmm1, %zmm2, %zmm1
        vmovups %zmm1, (%rdi,%rdx)
        leaq    64(%rdx), %rdx
        ja      .L26

Reply via email to