With this compiler: gcc version 4.4.0 20090312 (experimental) [trunk revision 144801] (GCC)
running the test in http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928 (same .i file, same instructions for reproducing, same compiler options, same everything) gives a time of 132 ms cpu time (132 user, 0 system) with assembly code in the main loop of .L2958: movq %rdx, %rcx addq (%r11), %rcx leaq 4(%rdx), %r14 movq %rcx, (%rdi) addq $4, %rcx movq %rcx, (%r10) movq (%r11), %rcx addq (%rdi), %rcx movq %rcx, (%rsi) addq $4, %rcx movq %rcx, (%r9) movq (%r11), %r12 addq (%rsi), %r12 movq %r12, (%rbp) addq $4, %r12 movq %r12, (%r15) movq (%rax), %rcx addq $7, %rcx movsd (%rcx,%r12,2), %xmm7 movq (%rbp), %r12 leaq (%rcx,%rdx,2), %r13 addq $8, %rdx movsd (%r13), %xmm4 movsd (%rcx,%r12,2), %xmm10 movq (%r9), %r12 movsd (%rcx,%r12,2), %xmm5 movq (%rsi), %r12 movsd (%rcx,%r12,2), %xmm6 movq (%r10), %r12 movsd (%rcx,%r12,2), %xmm13 movq (%rdi), %r12 movsd (%rcx,%r12,2), %xmm11 leaq (%r14,%r14), %r12 movsd (%rcx,%r12), %xmm9 movq 24(%r8), %rcx movapd %xmm11, %xmm14 movsd 15(%rcx), %xmm1 movsd 7(%rcx), %xmm2 movapd %xmm1, %xmm8 movsd 31(%rcx), %xmm3 movapd %xmm2, %xmm12 mulsd %xmm10, %xmm8 mulsd %xmm7, %xmm12 mulsd %xmm2, %xmm10 mulsd %xmm1, %xmm7 movsd 23(%rcx), %xmm0 addsd %xmm8, %xmm12 movapd %xmm2, %xmm8 mulsd %xmm6, %xmm2 subsd %xmm7, %xmm10 movapd %xmm1, %xmm7 mulsd %xmm5, %xmm1 mulsd %xmm6, %xmm7 movapd %xmm4, %xmm6 mulsd %xmm5, %xmm8 movapd %xmm9, %xmm5 subsd %xmm10, %xmm14 subsd %xmm1, %xmm2 movapd %xmm3, %xmm1 addsd %xmm11, %xmm10 xorpd .LC5(%rip), %xmm1 addsd %xmm7, %xmm8 movapd %xmm13, %xmm7 subsd %xmm2, %xmm6 subsd %xmm12, %xmm7 subsd %xmm8, %xmm5 addsd %xmm4, %xmm2 movapd %xmm0, %xmm4 addsd %xmm9, %xmm8 movapd %xmm1, %xmm9 mulsd %xmm14, %xmm4 addsd %xmm13, %xmm12 mulsd %xmm7, %xmm9 mulsd %xmm1, %xmm14 movapd %xmm3, %xmm1 mulsd %xmm0, %xmm7 mulsd %xmm10, %xmm1 mulsd %xmm0, %xmm10 addsd %xmm9, %xmm4 subsd %xmm7, %xmm14 movapd %xmm0, %xmm7 movapd %xmm2, %xmm0 mulsd %xmm12, %xmm7 mulsd %xmm3, %xmm12 addsd %xmm1, %xmm7 subsd %xmm12, %xmm10 addsd %xmm10, %xmm0 subsd 
%xmm10, %xmm2 movsd %xmm0, (%r13) movapd %xmm8, %xmm0 movq (%rax), %rcx subsd %xmm7, %xmm8 addsd %xmm7, %xmm0 movsd %xmm0, 7(%r12,%rcx) movq (%rdi), %r12 movq (%rax), %rcx movapd %xmm6, %xmm0 subsd %xmm14, %xmm6 movsd %xmm2, 7(%rcx,%r12,2) movq (%r10), %r12 movq (%rax), %rcx addsd %xmm14, %xmm0 movsd %xmm8, 7(%rcx,%r12,2) movq (%rsi), %r12 movq (%rax), %rcx movsd %xmm0, 7(%rcx,%r12,2) movapd %xmm5, %xmm0 movq (%r9), %r12 movq (%rax), %rcx subsd %xmm4, %xmm5 addsd %xmm4, %xmm0 movsd %xmm0, 7(%rcx,%r12,2) movq (%rbp), %r12 movq (%rax), %rcx movsd %xmm6, 7(%rcx,%r12,2) movq (%r15), %r12 movq (%rax), %rcx movsd %xmm5, 7(%rcx,%r12,2) cmpq %rdx, -104(%rsp) jg .L2958 movq %r14, -104(%rsp) With this compiler /pkgs/gcc-mainline/bin/gcc -v Using built-in specs. Target: x86_64-unknown-linux-gnu Configured with: /tmp/lucier/gcc/mainline/configure --enable-checking=release --prefix=/pkgs/gcc-mainline --enable-languages=c --enable-gather-detailed-mem-stats Thread model: posix gcc version 4.4.0 20090313 (experimental) [trunk revision 144829] (GCC) one gets a time of 212 ms cpu time (212 user, 0 system) and the assembly language for the main loop is .L2946: movq %rbx, %rdx addq (%r11), %rdx leaq 4(%rbx), %rbp movq %rdx, (%rsi) addq $4, %rdx movq %rdx, (%r10) movq (%r11), %rdx addq (%rsi), %rdx movq %rdx, (%rcx) addq $4, %rdx movq %rdx, (%r9) movq (%r11), %r13 addq (%rcx), %r13 movq %r13, (%r8) addq $4, %r13 movq %r13, (%r15) movq (%rax), %rdx addq $7, %rdx movsd (%rdx,%r13,2), %xmm0 leaq (%rdx,%rbx,2), %r14 addq $8, %rbx movsd %xmm0, -48(%rsp) movq (%r8), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -56(%rsp) movq (%r9), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -64(%rsp) movq (%rcx), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -72(%rsp) movq (%r10), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -80(%rsp) movq (%rsi), %r13 movsd (%rdx,%r13,2), %xmm0 leaq (%rbp,%rbp), %r13 movsd %xmm0, -104(%rsp) movsd (%rdx,%r13), %xmm0 movsd %xmm0, -88(%rsp) movq 24(%rdi), %rdx movsd 31(%rdx), 
%xmm0 movsd %xmm0, -32(%rsp) movsd 23(%rdx), %xmm0 movsd %xmm0, -40(%rsp) movsd 15(%rdx), %xmm0 movsd %xmm0, -112(%rsp) movsd 7(%rdx), %xmm0 movsd %xmm0, -120(%rsp) movapd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -48(%rsp), %xmm1 mulsd -56(%rsp), %xmm0 addsd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -48(%rsp), %xmm0 movsd %xmm1, -8(%rsp) movsd -120(%rsp), %xmm1 mulsd -56(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -72(%rsp), %xmm0 movsd %xmm1, -16(%rsp) movsd -120(%rsp), %xmm1 mulsd -64(%rsp), %xmm1 addsd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -64(%rsp), %xmm0 movsd %xmm1, -24(%rsp) movsd -120(%rsp), %xmm1 mulsd -72(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd -80(%rsp), %xmm0 subsd -8(%rsp), %xmm0 movsd %xmm1, -120(%rsp) movsd %xmm0, -48(%rsp) movsd -104(%rsp), %xmm0 subsd -16(%rsp), %xmm0 movsd %xmm0, -112(%rsp) movsd -88(%rsp), %xmm0 subsd -24(%rsp), %xmm0 movsd %xmm0, -56(%rsp) movsd (%r14), %xmm0 subsd %xmm1, %xmm0 movsd %xmm0, -64(%rsp) movsd -80(%rsp), %xmm0 addsd -8(%rsp), %xmm0 movsd %xmm0, -80(%rsp) movsd -104(%rsp), %xmm0 addsd -16(%rsp), %xmm0 movsd %xmm0, -104(%rsp) movsd -88(%rsp), %xmm0 addsd -24(%rsp), %xmm0 movsd %xmm0, -88(%rsp) movsd (%r14), %xmm0 addsd %xmm1, %xmm0 movsd %xmm0, -96(%rsp) movsd -32(%rsp), %xmm0 xorpd .LC5(%rip), %xmm0 movsd %xmm0, -120(%rsp) movapd %xmm0, %xmm1 movsd -40(%rsp), %xmm0 mulsd -48(%rsp), %xmm1 mulsd -112(%rsp), %xmm0 addsd %xmm0, %xmm1 movsd -40(%rsp), %xmm0 mulsd -48(%rsp), %xmm0 movsd %xmm1, -72(%rsp) movsd -120(%rsp), %xmm1 mulsd -112(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd -32(%rsp), %xmm0 mulsd -104(%rsp), %xmm0 movsd %xmm1, -112(%rsp) movsd -40(%rsp), %xmm1 mulsd -80(%rsp), %xmm1 addsd %xmm0, %xmm1 movsd -32(%rsp), %xmm0 mulsd -80(%rsp), %xmm0 movsd %xmm1, -120(%rsp) movsd -40(%rsp), %xmm1 mulsd -104(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd %xmm1, -104(%rsp) movsd -96(%rsp), %xmm0 addsd %xmm1, %xmm0 movsd %xmm0, (%r14) movq (%rax), %rdx movsd -88(%rsp), %xmm0 addsd -120(%rsp), %xmm0 movsd %xmm0, 
7(%r13,%rdx) movq (%rsi), %r13 movq (%rax), %rdx movsd -96(%rsp), %xmm0 subsd -104(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r10), %r13 movq (%rax), %rdx movsd -88(%rsp), %xmm0 subsd -120(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%rcx), %r13 movq (%rax), %rdx movsd -64(%rsp), %xmm0 addsd -112(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r9), %r13 movq (%rax), %rdx movsd -56(%rsp), %xmm0 addsd -72(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r8), %r13 movq (%rax), %rdx movsd -64(%rsp), %xmm0 subsd -112(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r15), %r13 movq (%rax), %rdx movsd -56(%rsp), %xmm0 subsd -72(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) cmpq %rbx, (%rsp) jg .L2946 movq %rbp, (%rsp) I'm reporting this separately because it doesn't have the same cause as the previous PR 33928. BTW, with 4.2.4 this test runs in 108 ms on this machine, hence the total regression amount noted in the subject line. This part itself causes about 60% performance regression; the rest is accounted for by http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928 Brad -- Summary: 96% performance regression in floating point code; part of the problem started 2009/03/12-13 Product: gcc Version: 4.4.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: regression AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: lucier at math dot purdue dot edu GCC build triplet: x86_64-unknown-linux-gnu GCC host triplet: x86_64-unknown-linux-gnu GCC target triplet: x86_64-unknown-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39914