------- Comment #8 from uros at kss-loka dot si  2006-08-17 07:45 -------
It is also interesting that -march=pentium4 produces the following "de-optimized"
code, which adds a couple more instructions and wastes the %eax register:

.L8:
        leal    (%ebx,%ebx), %eax
        movl    40(%esp), %edx
        movl    (%edx,%eax,2), %edx
        movl    %edx, (%esp)
        movl    40(%esp), %edx
        movl    4(%edx,%eax,2), %ecx
        movapd  %xmm2, %xmm1
        cmpl    %ecx, (%esp)
        jge     .L11
        movl    (%esp), %edx
.L12:

Some additional timings (gcc-4.2 -O2 -fomit-frame-pointer):

-march=pentium4: 0m2.756s
-march=pentium4 -fno-ivopts: 0m2.500s
-march=pentium4 -fno-ivopts -mfpmath=sse: 0m2.461s
-msse2 -fno-ivopts -mfpmath=sse: 0m2.311s

In the last case, the generated code is identical to the code generated by gcc-3.2:

.L8:
        movl    36(%esp), %edx
        movapd  %xmm2, %xmm1
        movl    (%edx,%ebx,4), %eax
        movl    4(%edx,%ebx,4), %ecx
        cmpl    %ecx, %eax
        jge     .L11
        movl    %eax, %edx
        .p2align 4,,7
.L12:
        movl    (%edi,%edx,4), %eax
        movsd   (%esi,%eax,8), %xmm0
        mulsd   (%ebp,%edx,8), %xmm0
        addl    $1, %edx
        cmpl    %edx, %ecx
        addsd   %xmm0, %xmm1
        jg      .L12


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676

Reply via email to