https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92645
Bug ID: 92645 Summary: Hand written vector code is 450 times slower when compiled with GCC compared to Clang Product: gcc Version: 10.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- Hi, attached are preprocessed files for Skia where the Clang ifdefs were removed so we get roughly the same file for GCC and Clang. The internal loop of _ZN3hsw16blit_row_color32EPjPKjij, _ZN3hsw16blit_row_color32EPjPKjij, _ZN3hsw16blit_row_color32EPjPKjij and _ZN3hsw16blit_row_color32EPjPKjij looks a lot worse when compiled by GCC than by Clang. I also added flatten to eliminate the inlining difference. Clang has heuristics that make functions with hand-written vector code hot. GCC code packs via the stack: 0.43 │ mov %ax,0xae(%rsp) 0.03 │ movzbl 0x78(%rsp),%eax 0.02 │ mov %cx,0xd8(%rsp) 0.02 │ mov %ax,0xb0(%rsp) 0.54 │ vpextrb $0x9,%xmm5,%eax 0.16 │ mov %ax,0xb2(%rsp) 0.51 │ vpextrb $0xa,%xmm5,%eax 0.21 │ mov %ax,0xb4(%rsp) 0.16 │ vpextrb $0xb,%xmm5,%eax 0.46 │ mov %ax,0xb6(%rsp) 0.24 │ vpextrb $0xc,%xmm5,%eax 0.28 │ mov %ax,0xb8(%rsp) 0.41 │ vpextrb $0xd,%xmm5,%eax 0.20 │ mov %ax,0xba(%rsp) 0.47 │ vpextrb $0xe,%xmm5,%eax 0.92 │ mov %ax,0xbc(%rsp) 0.72 │ vpextrb $0xf,%xmm5,%eax 1.24 │ mov %ax,0xbe(%rsp) 10.94 │ vmovdqa 0xa0(%rsp),%ymm4 0.02 │ mov %cx,0xda(%rsp) 0.00 │ mov %cx,0xdc(%rsp) │ mov %cx,0xde(%rsp) 10.34 │ vpmullw 0xc0(%rsp),%ymm4,%ymm0 2.05 │ vpaddw %ymm1,%ymm0,%ymm0 0.50 │ vpaddw %ymm3,%ymm0,%ymm0 0.00 │ mov %r9,0x58(%rsp) 0.52 │ vpsrlw $0x8,%ymm0,%ymm0 0.39 │ vpextrw $0x0,%xmm0,%eax 0.69 │ mov %al,%r8b 0.17 │ vpextrw $0x1,%xmm0,%eax 0.51 │ mov %r8,0x50(%rsp) 6.87 │ vmovdqa 0x50(%rsp),%xmm5 1.08 │ vpinsrb $0x1,%eax,%xmm5,%xmm1 0.00 │ vpextrw $0x2,%xmm0,%eax 0.73 │ vpinsrb $0x2,%eax,%xmm1,%xmm1 0.02 │ vpextrw $0x3,%xmm0,%eax 0.75 │ vpinsrb $0x3,%eax,%xmm1,%xmm1 0.10 │ vpextrw $0x4,%xmm0,%eax 0.98 │ vpinsrb $0x4,%eax,%xmm1,%xmm1 0.16 │ vpextrw 
$0x5,%xmm0,%eax 1.00 │ vpinsrb $0x5,%eax,%xmm1,%xmm1 0.22 │ vpextrw $0x6,%xmm0,%eax 1.10 │ vpinsrb $0x6,%eax,%xmm1,%xmm1 0.30 │ vpextrw $0x7,%xmm0,%eax 0.31 │ vextracti128 $0x1,%ymm0,%xmm0 0.90 │ vpinsrb $0x7,%eax,%xmm1,%xmm6 0.21 │ vpextrw $0x0,%xmm0,%eax 0.35 │ vmovaps %xmm6,0x50(%rsp) 1.15 │ mov 0x58(%rsp),%r9 0.13 │ mov 0x50(%rsp),%r8 0.29 │ mov %al,%r9b 0.49 │ mov %r8,0x50(%rsp) 0.07 │ vpextrw $0x1,%xmm0,%eax 0.45 │ mov %r9,0x58(%rsp) 7.08 │ vmovdqa 0x50(%rsp),%xmm7 1.19 │ vpinsrb $0x9,%eax,%xmm7,%xmm1 0.00 │ vpextrw $0x2,%xmm0,%eax 0.78 │ vpinsrb $0xa,%eax,%xmm1,%xmm1 0.00 │ vpextrw $0x3,%xmm0,%eax 0.77 │ vpinsrb $0xb,%eax,%xmm1,%xmm1 0.01 │ vpextrw $0x4,%xmm0,%eax 0.86 │ vpinsrb $0xc,%eax,%xmm1,%xmm1 0.03 │ vpextrw $0x5,%xmm0,%eax 0.88 │ vpinsrb $0xd,%eax,%xmm1,%xmm1 0.04 │ vpextrw $0x6,%xmm0,%eax 0.97 │ vpinsrb $0xe,%eax,%xmm1,%xmm1 0.08 │ vpextrw $0x7,%xmm0,%eax 1.44 │ vpinsrb $0xf,%eax,%xmm1,%xmm0 1.37 │ vpextrd $0x1,%xmm0,%eax 0.13 │ vinsertps $0xe,%xmm0,%xmm0,%xmm1 0.02 │ vmovaps %xmm0,0x50(%rsp) 2.17 │ vpinsrd $0x1,%eax,%xmm1,%xmm1 .... 
Clang code: Percent│ vpmullw %ymm0,%ymm2,%ymm2 │ vpaddw %ymm1,%ymm2,%ymm2 │ vpsrlw $0x8,%ymm2,%ymm2 │ vextracti128 $0x1,%ymm2,%xmm3 │ vpackuswb %xmm3,%xmm2,%xmm2 │ vmovdqu %xmm2,(%rdi) │ add $0x10,%rsi │ add $0x10,%rdi │ mov %r9d,%eax │ cmp $0x4,%r9d │ ↓ jae 39179b0 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0xa0> │ ↓ jmp 3917a02 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0xf2> │ mov %edx,%eax 0.29 │ cmp $0x4,%r9d 0.00 │ ↓ jb 3917a02 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0xf2> 0.07 │ nop 3.95 │ vpmovzxbw (%rsi),%ymm2 13.41 │ vpmullw %ymm0,%ymm2,%ymm2 13.87 │ vpaddw %ymm1,%ymm2,%ymm2 2.93 │ vpsrlw $0x8,%ymm2,%ymm2 0.84 │ vextracti128 $0x1,%ymm2,%xmm3 9.98 │ vpackuswb %xmm3,%xmm2,%xmm2 6.89 │ vmovdqu %xmm2,(%rdi) 0.57 │ vpmovzxbw 0x10(%rsi),%ymm2 4.02 │ vpmullw %ymm0,%ymm2,%ymm2 12.15 │ vpaddw %ymm1,%ymm2,%ymm2 2.02 │ vpsrlw $0x8,%ymm2,%ymm2 1.22 │ vextracti128 $0x1,%ymm2,%xmm3 8.04 │ vpackuswb %xmm3,%xmm2,%xmm2 7.09 │ vmovdqu %xmm2,0x10(%rdi) 0.19 │ add $0x20,%rsi 0.19 │ add $0x20,%rdi 0.57 │ add $0xfffffff8,%eax 7.26 │ cmp $0x3,%eax 0.29 │ ↑ jg 39179b0 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0xa0> 0.02 │ and $0x3,%edx Percent│ test %edx,%edx 0.04 │ ↓ jle 3917af1 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0x1e1> │ mov %ecx,%eax │ shr $0x18,%eax │ shr $0x1f,%ecx │ add %eax,%ecx │ vmovd %ecx,%xmm0 │ vpbroadcastb %xmm0,%xmm0 │ mov %r8,%rax │ shl $0x20,%rax │ or %r8,%rax │ vmovq %rax,%xmm1 │ vpbroadcastq %xmm1,%xmm1 │ vpmovzxbw %xmm1,%ymm1 │ vpmovzxbw %xmm0,%ymm0 │ vpsllw $0x8,%ymm1,%ymm1 │ vpor 0x1f3eb35(%rip),%ymm1,%ymm1 # 5856580 <SkNamedGamut::kAdobeRGB+0x858> │ test $0x1,%dl │ ↓ jne 3917a5c <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0x14c> │ mov %edx,%eax │ cmp $0x1,%edx │ ↓ jne 3917a90 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, 
unsigned int)+0x180> │ ↓ jmpq 3917af1 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0x1e1> │ lea -0x1(%rdx),%eax │ mov (%rsi),%ecx │ add $0x4,%rsi │ vmovq %rcx,%xmm2 │ vpmovzxbw %xmm2,%xmm2 │ vpmullw %xmm0,%xmm2,%xmm2 │ vpaddw %xmm1,%xmm2,%xmm2 │ vpsrlw $0x8,%xmm2,%xmm2 │ vpackuswb %xmm0,%xmm2,%xmm2 │ vmovq %xmm2,%rcx │ mov %ecx,(%rdi) │ add $0x4,%rdi │ cmp $0x1,%edx │ xor %ecx,%ecx │ nop │ nop │ mov (%rsi,%rcx,1),%edx │ vmovq %rdx,%xmm2 │ vpmovzxbw %xmm2,%xmm2 │ vpmullw %xmm0,%xmm2,%xmm2 │ vpaddw %xmm1,%xmm2,%xmm2 │ vpsrlw $0x8,%xmm2,%xmm2 │ vpackuswb %xmm0,%xmm2,%xmm2 │ vmovd %xmm2,(%rdi,%rcx,1) │ mov 0x4(%rsi,%rcx,1),%edx │ vmovq %rdx,%xmm2 │ vpmovzxbw %xmm2,%xmm2 │ vpmullw %xmm0,%xmm2,%xmm2 │ vpaddw %xmm1,%xmm2,%xmm2 │ vpsrlw $0x8,%xmm2,%xmm2 │ vpackuswb %xmm0,%xmm2,%xmm2 │ vmovd %xmm2,0x4(%rdi,%rcx,1) │ add $0x8,%rcx │ add $0xfffffffe,%eax │ ↑ jg 3917aa0 <hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)+0x190> 0.38 │ vzeroupper 0.26 │ ← retq