http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47657
Summary: missed vectorization
Product: gcc
Version: 4.6.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: joost.vandevond...@pci.uzh.ch

The following is not vectorized with gfortran (4.6 / 4.5):

gfortran -O3 -ffast-math -ftree-vectorizer-verbose=6 -S -march=native
( -march=amdfam10 -mcx16 -msahf -mpopcnt -mabm )

SUBROUTINE smm_dnn_8_8_8_4_1_2_1(A,B,C)
   REAL(KIND=8) :: C(8,8), B(8,8), A(8,8)
   INTEGER ::i,j,l
   DO j= 1 , 8 , 2
      DO l= 1 , 8 , 1
         DO i= 1 , 8 , 1
            C(i+0,j+0)=C(i+0,j+0)+A(i+0,l+0)*B(l+0,j+0)
            C(i+0,j+1)=C(i+0,j+1)+A(i+0,l+0)*B(l+0,j+1)
         ENDDO
      ENDDO
   ENDDO
END SUBROUTINE

The Cray ftn compiler does vectorize it, yielding about twice the speed.

Reference asm:

0000000000000000 <smm_dnn_8_8_8_4_1_2_1_>:
   0:	53                   	push   %rbx
   1:	48 89 7c 24 f8       	mov    %rdi,-0x8(%rsp)
   6:	48 89 74 24 f0       	mov    %rsi,-0x10(%rsp)
   b:	48 89 54 24 e8       	mov    %rdx,-0x18(%rsp)
  10:	31 c0                	xor    %eax,%eax
  12:	48 89 d1             	mov    %rdx,%rcx
  15:	49 89 c0             	mov    %rax,%r8
  18:	49 89 c1             	mov    %rax,%r9
  1b:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
  20:	66 0f 10 04 02       	movupd (%rdx,%rax,1),%xmm0
  25:	66 0f 10 4c 02 40    	movupd 0x40(%rdx,%rax,1),%xmm1
  2b:	66 0f 10 54 02 10    	movupd 0x10(%rdx,%rax,1),%xmm2
  31:	66 0f 10 5c 02 50    	movupd 0x50(%rdx,%rax,1),%xmm3
  37:	66 0f 10 64 02 20    	movupd 0x20(%rdx,%rax,1),%xmm4
  3d:	66 0f 10 6c 02 60    	movupd 0x60(%rdx,%rax,1),%xmm5
  43:	66 0f 10 74 02 30    	movupd 0x30(%rdx,%rax,1),%xmm6
  49:	66 0f 10 7c 02 70    	movupd 0x70(%rdx,%rax,1),%xmm7
  4f:	45 31 d2             	xor    %r10d,%r10d
  52:	4d 89 d3             	mov    %r10,%r11
  55:	66 66 2e 0f 1f 84 00 	nopw   %cs:0x0(%rax,%rax,1)
  5c:	00 00 00 00 
  60:	66 46 0f 10 44 1f 30 	movupd 0x30(%rdi,%r11,1),%xmm8
  67:	4b 8d 1c 02          	lea    (%r10,%r8,1),%rbx
  6b:	f2 44 0f 12 4c de 40 	movddup 0x40(%rsi,%rbx,8),%xmm9
  72:	66 45 0f 28 d1       	movapd %xmm9,%xmm10
  77:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10
  7c:	66 41 0f 58 fa       	addpd  %xmm10,%xmm7
  81:	f2 44 0f 12 14 de    	movddup (%rsi,%rbx,8),%xmm10
  87:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8
  8c:	66 41 0f 58 f0       	addpd  %xmm8,%xmm6
  91:	66 46 0f 10 44 1f 20 	movupd 0x20(%rdi,%r11,1),%xmm8
  98:	66 45 0f 28 d9       	movapd %xmm9,%xmm11
  9d:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11
  a2:	66 41 0f 58 eb       	addpd  %xmm11,%xmm5
  a7:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8
  ac:	66 41 0f 58 e0       	addpd  %xmm8,%xmm4
  b1:	66 46 0f 10 44 1f 10 	movupd 0x10(%rdi,%r11,1),%xmm8
  b8:	66 45 0f 28 d9       	movapd %xmm9,%xmm11
  bd:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11
  c2:	66 41 0f 58 db       	addpd  %xmm11,%xmm3
  c7:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8
  cc:	66 41 0f 58 d0       	addpd  %xmm8,%xmm2
  d1:	66 46 0f 10 04 1f    	movupd (%rdi,%r11,1),%xmm8
  d7:	66 45 0f 59 c8       	mulpd  %xmm8,%xmm9
  dc:	66 41 0f 58 c9       	addpd  %xmm9,%xmm1
  e1:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10
  e6:	66 41 0f 58 c2       	addpd  %xmm10,%xmm0
  eb:	49 83 c3 40          	add    $0x40,%r11
  ef:	49 ff c2             	inc    %r10
  f2:	49 83 fa 08          	cmp    $0x8,%r10
  f6:	0f 8c 64 ff ff ff    	jl     60 <smm_dnn_8_8_8_4_1_2_1_+0x60>
  fc:	f2 0f 11 7c 01 70    	movsd  %xmm7,0x70(%rcx,%rax,1)
 102:	66 0f 17 7c 01 78    	movhpd %xmm7,0x78(%rcx,%rax,1)
 108:	f2 0f 11 74 02 30    	movsd  %xmm6,0x30(%rdx,%rax,1)
 10e:	66 0f 17 74 02 38    	movhpd %xmm6,0x38(%rdx,%rax,1)
 114:	f2 0f 11 6c 02 60    	movsd  %xmm5,0x60(%rdx,%rax,1)
 11a:	66 0f 17 6c 02 68    	movhpd %xmm5,0x68(%rdx,%rax,1)
 120:	f2 0f 11 64 02 20    	movsd  %xmm4,0x20(%rdx,%rax,1)
 126:	66 0f 17 64 02 28    	movhpd %xmm4,0x28(%rdx,%rax,1)
 12c:	f2 0f 11 5c 02 50    	movsd  %xmm3,0x50(%rdx,%rax,1)
 132:	66 0f 17 5c 02 58    	movhpd %xmm3,0x58(%rdx,%rax,1)
 138:	f2 0f 11 54 02 10    	movsd  %xmm2,0x10(%rdx,%rax,1)
 13e:	66 0f 17 54 02 18    	movhpd %xmm2,0x18(%rdx,%rax,1)
 144:	f2 0f 11 4c 02 40    	movsd  %xmm1,0x40(%rdx,%rax,1)
 14a:	66 0f 17 4c 02 48    	movhpd %xmm1,0x48(%rdx,%rax,1)
 150:	f2 0f 11 04 02       	movsd  %xmm0,(%rdx,%rax,1)
 155:	66 0f 17 44 02 08    	movhpd %xmm0,0x8(%rdx,%rax,1)
 15b:	49 83 c0 10          	add    $0x10,%r8
 15f:	48 83 e8 80          	sub    $0xffffffffffffff80,%rax
 163:	49 ff c1             	inc    %r9
 166:	49 83 f9 04          	cmp    $0x4,%r9
 16a:	0f 8c b0 fe ff ff    	jl     20 <smm_dnn_8_8_8_4_1_2_1_+0x20>
 170:	5b                   	pop    %rbx
 171:	c3                   	retq   
 172:	66 66 66 66 66 2e 0f 	nopw   %cs:0x0(%rax,%rax,1)
 179:	1f 84 00 00 00 00 00 