------- Comment #8 from uros at kss-loka dot si 2006-08-17 07:45 ------- It is also interesting that -march=pentium4 produces the following "de-optimized" code, adding a couple more instructions and wasting the %eax register:
.L8: leal (%ebx,%ebx), %eax movl 40(%esp), %edx movl (%edx,%eax,2), %edx movl %edx, (%esp) movl 40(%esp), %edx movl 4(%edx,%eax,2), %ecx movapd %xmm2, %xmm1 cmpl %ecx, (%esp) jge .L11 movl (%esp), %edx .L12: Some additional timings can be shown (gcc-4.2 -O2 -fomit-frame-pointer): -march=pentium4: 0m2.756s -march=pentium4 -fno-ivopts: 0m2.500s -march=pentium4 -fno-ivopts -mfpmath=sse: 0m2.461s -msse2 -fno-ivopts -mfpmath=sse: 0m2.311s In the last case, the generated code is identical to the code generated by gcc-3.2: .L8: movl 36(%esp), %edx movapd %xmm2, %xmm1 movl (%edx,%ebx,4), %eax movl 4(%edx,%ebx,4), %ecx cmpl %ecx, %eax jge .L11 movl %eax, %edx .p2align 4,,7 .L12: movl (%edi,%edx,4), %eax movsd (%esi,%eax,8), %xmm0 mulsd (%ebp,%edx,8), %xmm0 addl $1, %edx cmpl %edx, %ecx addsd %xmm0, %xmm1 jg .L12 -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676