http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47657

           Summary: missed vectorization
           Product: gcc
           Version: 4.6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: joost.vandevond...@pci.uzh.ch


The following subroutine is not vectorized by gfortran (4.6 / 4.5), compiled with:

gfortran -O3 -ffast-math -ftree-vectorizer-verbose=6 -S -march=native
(where -march=native expands to -march=amdfam10 -mcx16 -msahf -mpopcnt -mabm)

   SUBROUTINE smm_dnn_8_8_8_4_1_2_1(A,B,C)
      REAL(KIND=8) :: C(8,8), B(8,8), A(8,8)
      INTEGER :: i, j, l
      DO j = 1, 8, 2
         DO l = 1, 8, 1
            DO i = 1, 8, 1
               C(i+0,j+0) = C(i+0,j+0) + A(i+0,l+0)*B(l+0,j+0)
               C(i+0,j+1) = C(i+0,j+1) + A(i+0,l+0)*B(l+0,j+1)
            ENDDO
         ENDDO
      ENDDO
   END SUBROUTINE

The Cray ftn compiler does vectorize this loop nest, yielding roughly twice the speed.
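
For timing the gfortran and Cray builds against each other, a minimal driver along
these lines can be used (a sketch only, not part of the original testcase; the
repetition count is an arbitrary choice, timing uses the standard SYSTEM_CLOCK
intrinsic):

   PROGRAM bench_smm
      IMPLICIT NONE
      REAL(KIND=8) :: A(8,8), B(8,8), C(8,8)
      INTEGER :: rep, t0, t1, rate
      ! fill the operands and accumulate into C repeatedly
      CALL RANDOM_NUMBER(A)
      CALL RANDOM_NUMBER(B)
      C = 0.0D0
      CALL SYSTEM_CLOCK(t0, rate)
      DO rep = 1, 1000000
         CALL smm_dnn_8_8_8_4_1_2_1(A, B, C)
      ENDDO
      CALL SYSTEM_CLOCK(t1)
      PRINT *, 'time [s] :', REAL(t1 - t0, KIND=8) / REAL(rate, KIND=8)
      PRINT *, 'checksum :', SUM(C)   ! keep the result live
   END PROGRAM bench_smm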

reference asm:
0000000000000000 <smm_dnn_8_8_8_4_1_2_1_>:
   0:   53                      push   %rbx
   1:   48 89 7c 24 f8          mov    %rdi,-0x8(%rsp)
   6:   48 89 74 24 f0          mov    %rsi,-0x10(%rsp)
   b:   48 89 54 24 e8          mov    %rdx,-0x18(%rsp)
  10:   31 c0                   xor    %eax,%eax
  12:   48 89 d1                mov    %rdx,%rcx
  15:   49 89 c0                mov    %rax,%r8
  18:   49 89 c1                mov    %rax,%r9
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   66 0f 10 04 02          movupd (%rdx,%rax,1),%xmm0
  25:   66 0f 10 4c 02 40       movupd 0x40(%rdx,%rax,1),%xmm1
  2b:   66 0f 10 54 02 10       movupd 0x10(%rdx,%rax,1),%xmm2
  31:   66 0f 10 5c 02 50       movupd 0x50(%rdx,%rax,1),%xmm3
  37:   66 0f 10 64 02 20       movupd 0x20(%rdx,%rax,1),%xmm4
  3d:   66 0f 10 6c 02 60       movupd 0x60(%rdx,%rax,1),%xmm5
  43:   66 0f 10 74 02 30       movupd 0x30(%rdx,%rax,1),%xmm6
  49:   66 0f 10 7c 02 70       movupd 0x70(%rdx,%rax,1),%xmm7
  4f:   45 31 d2                xor    %r10d,%r10d
  52:   4d 89 d3                mov    %r10,%r11
  55:   66 66 2e 0f 1f 84 00    nopw   %cs:0x0(%rax,%rax,1)
  5c:   00 00 00 00
  60:   66 46 0f 10 44 1f 30    movupd 0x30(%rdi,%r11,1),%xmm8
  67:   4b 8d 1c 02             lea    (%r10,%r8,1),%rbx
  6b:   f2 44 0f 12 4c de 40    movddup 0x40(%rsi,%rbx,8),%xmm9
  72:   66 45 0f 28 d1          movapd %xmm9,%xmm10
  77:   66 45 0f 59 d0          mulpd  %xmm8,%xmm10
  7c:   66 41 0f 58 fa          addpd  %xmm10,%xmm7
  81:   f2 44 0f 12 14 de       movddup (%rsi,%rbx,8),%xmm10
  87:   66 45 0f 59 c2          mulpd  %xmm10,%xmm8
  8c:   66 41 0f 58 f0          addpd  %xmm8,%xmm6
  91:   66 46 0f 10 44 1f 20    movupd 0x20(%rdi,%r11,1),%xmm8
  98:   66 45 0f 28 d9          movapd %xmm9,%xmm11
  9d:   66 45 0f 59 d8          mulpd  %xmm8,%xmm11
  a2:   66 41 0f 58 eb          addpd  %xmm11,%xmm5
  a7:   66 45 0f 59 c2          mulpd  %xmm10,%xmm8
  ac:   66 41 0f 58 e0          addpd  %xmm8,%xmm4
  b1:   66 46 0f 10 44 1f 10    movupd 0x10(%rdi,%r11,1),%xmm8
  b8:   66 45 0f 28 d9          movapd %xmm9,%xmm11
  bd:   66 45 0f 59 d8          mulpd  %xmm8,%xmm11
  c2:   66 41 0f 58 db          addpd  %xmm11,%xmm3
  c7:   66 45 0f 59 c2          mulpd  %xmm10,%xmm8
  cc:   66 41 0f 58 d0          addpd  %xmm8,%xmm2
  d1:   66 46 0f 10 04 1f       movupd (%rdi,%r11,1),%xmm8
  d7:   66 45 0f 59 c8          mulpd  %xmm8,%xmm9
  dc:   66 41 0f 58 c9          addpd  %xmm9,%xmm1
  e1:   66 45 0f 59 d0          mulpd  %xmm8,%xmm10
  e6:   66 41 0f 58 c2          addpd  %xmm10,%xmm0
  eb:   49 83 c3 40             add    $0x40,%r11
  ef:   49 ff c2                inc    %r10
  f2:   49 83 fa 08             cmp    $0x8,%r10
  f6:   0f 8c 64 ff ff ff       jl     60 <smm_dnn_8_8_8_4_1_2_1_+0x60>
  fc:   f2 0f 11 7c 01 70       movsd  %xmm7,0x70(%rcx,%rax,1)
 102:   66 0f 17 7c 01 78       movhpd %xmm7,0x78(%rcx,%rax,1)
 108:   f2 0f 11 74 02 30       movsd  %xmm6,0x30(%rdx,%rax,1)
 10e:   66 0f 17 74 02 38       movhpd %xmm6,0x38(%rdx,%rax,1)
 114:   f2 0f 11 6c 02 60       movsd  %xmm5,0x60(%rdx,%rax,1)
 11a:   66 0f 17 6c 02 68       movhpd %xmm5,0x68(%rdx,%rax,1)
 120:   f2 0f 11 64 02 20       movsd  %xmm4,0x20(%rdx,%rax,1)
 126:   66 0f 17 64 02 28       movhpd %xmm4,0x28(%rdx,%rax,1)
 12c:   f2 0f 11 5c 02 50       movsd  %xmm3,0x50(%rdx,%rax,1)
 132:   66 0f 17 5c 02 58       movhpd %xmm3,0x58(%rdx,%rax,1)
 138:   f2 0f 11 54 02 10       movsd  %xmm2,0x10(%rdx,%rax,1)
 13e:   66 0f 17 54 02 18       movhpd %xmm2,0x18(%rdx,%rax,1)
 144:   f2 0f 11 4c 02 40       movsd  %xmm1,0x40(%rdx,%rax,1)
 14a:   66 0f 17 4c 02 48       movhpd %xmm1,0x48(%rdx,%rax,1)
 150:   f2 0f 11 04 02          movsd  %xmm0,(%rdx,%rax,1)
 155:   66 0f 17 44 02 08       movhpd %xmm0,0x8(%rdx,%rax,1)
 15b:   49 83 c0 10             add    $0x10,%r8
 15f:   48 83 e8 80             sub    $0xffffffffffffff80,%rax
 163:   49 ff c1                inc    %r9
 166:   49 83 f9 04             cmp    $0x4,%r9
 16a:   0f 8c b0 fe ff ff       jl     20 <smm_dnn_8_8_8_4_1_2_1_+0x20>
 170:   5b                      pop    %rbx
 171:   c3                      retq
 172:   66 66 66 66 66 2e 0f    nopw   %cs:0x0(%rax,%rax,1)
 179:   1f 84 00 00 00 00 00
