https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64909
Bug ID: 64909
Summary: [4.8/5 regression] Missed vectorization
Product: gcc
Version: 5.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org

Hi,
the following loop (taken from Firefox Unicode stuff)

unsigned short a[32];
unsigned int b[32];
t()
{
  int i;
  for (i=0;i<12;i++)
    b[i]=a[i];
}

is compiled by clang to:

t:                                      # @t
        .cfi_startproc
# BB#0:
        vpmovzxwd a(%rip), %xmm0
        vmovdqa .LCPI0_0(%rip), %xmm1   # xmm1 = [65535,65535,65535,65535]
        vpand %xmm1, %xmm0, %xmm0
        vmovdqa %xmm0, b(%rip)
        vpmovzxwd a+8(%rip), %xmm0
        vpand %xmm1, %xmm0, %xmm0
        vmovdqa %xmm0, b+16(%rip)
        vpmovzxwd a+16(%rip), %xmm0
        vpand %xmm1, %xmm0, %xmm0
        vmovdqa %xmm0, b+32(%rip)
        retq

GCC 4.7 does:

t:
.LFB0:
        .cfi_startproc
        movzwl a+16(%rip), %eax
        vmovaps a(%rip), %xmm0
        vpmovzxwd %xmm0, %xmm1
        vpsrldq $8, %xmm0, %xmm0
        vpmovzxwd %xmm0, %xmm0
        movl %eax, b+32(%rip)
        movzwl a+18(%rip), %eax
        vmovaps %xmm1, b(%rip)
        vmovaps %xmm0, b+16(%rip)
        movl %eax, b+36(%rip)
        movzwl a+20(%rip), %eax
        movl %eax, b+40(%rip)
        movzwl a+22(%rip), %eax
        movl %eax, b+44(%rip)
        ret

while 4.8 and mainline unroll it and keep it that way:

t:
.LFB0:
        .cfi_startproc
        movzwl a(%rip), %eax
        movl %eax, b(%rip)
        movzwl a+2(%rip), %eax
        movl %eax, b+4(%rip)
        movzwl a+4(%rip), %eax
        movl %eax, b+8(%rip)
        movzwl a+6(%rip), %eax
        movl %eax, b+12(%rip)
        movzwl a+8(%rip), %eax
        movl %eax, b+16(%rip)
        movzwl a+10(%rip), %eax
        movl %eax, b+20(%rip)
        movzwl a+12(%rip), %eax
        movl %eax, b+24(%rip)
        movzwl a+14(%rip), %eax
        movl %eax, b+28(%rip)
        movzwl a+16(%rip), %eax
        movl %eax, b+32(%rip)
        movzwl a+18(%rip), %eax
        movl %eax, b+36(%rip)
        movzwl a+20(%rip), %eax
        movl %eax, b+40(%rip)
        movzwl a+22(%rip), %eax
        movl %eax, b+44(%rip)
        ret