Caligo:

> There shouldn't be a performance difference between the two, but there is.

It seems the compiler isn't removing some useless code (the first has 3 groups 
of movsd, the second has 4 of them):

------------

v = v * 1.00000012;
main:

L45:    mov ESI,offset FLAT:_D4test6Vector6__initZ
        lea EDI,068h[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        fld qword ptr 010h[ESP]
        fld qword ptr 018h[ESP]
        fxch    ST1
        fmul    qword ptr FLAT:_DATA[018h]
        lea ESI,068h[ESP]
        lea EDI,048h[ESP]
        fxch    ST1
        fmul    qword ptr FLAT:_DATA[018h]
        fld qword ptr 8[ESP]
        fmul    qword ptr FLAT:_DATA[018h]
        fxch    ST2
        fstp    qword ptr 080h[ESP]
        fxch    ST1
        fld qword ptr 080h[ESP]
        fxch    ST2
        fstp    qword ptr 088h[ESP]
        fxch    ST1
        fld qword ptr 088h[ESP]
        fxch    ST2
        fstp    qword ptr 068h[ESP]
        fstp    qword ptr 070h[ESP]
        fstp    qword ptr 078h[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        lea ESI,048h[ESP]
        lea EDI,8[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        inc EAX
        cmp EAX,03938700h
        jb  L45

-----------------------------

v = 1.00000012 * v;
main:

L45:    mov ESI,offset FLAT:_D4test6Vector6__initZ
        lea EDI,088h[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        fld qword ptr 010h[ESP]
        fld qword ptr 018h[ESP]
        fxch    ST1
        fmul    qword ptr FLAT:_DATA[018h]
        lea ESI,088h[ESP]
        fxch    ST1
        fmul    qword ptr FLAT:_DATA[018h]
        fld qword ptr 8[ESP]
        fxch    ST2
        lea EDI,068h[ESP]
        fxch    ST2
        fmul    qword ptr FLAT:_DATA[018h]
        fxch    ST2
        fstp    qword ptr 0A0h[ESP]
        fxch    ST1
        fld qword ptr 0A0h[ESP]
        fxch    ST2
        fstp    qword ptr 0A8h[ESP]
        fxch    ST1
        fld qword ptr 0A8h[ESP]
        fxch    ST2
        fstp    qword ptr 088h[ESP]
        fstp    qword ptr 090h[ESP]
        fstp    qword ptr 098h[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        lea ESI,068h[ESP]
        lea EDI,048h[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        lea ESI,048h[ESP]
        lea EDI,8[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        inc EAX
        cmp EAX,03938700h
        jb  L45

-----------------

v.x *= 1.00000012; v.y *= 1.00000012; v.z *= 1.00000012;

L42:    fld qword ptr FLAT:_DATA[018h]
        inc EAX
        cmp EAX,03938700h
        fmul    qword ptr 8[ESP]
        fstp    qword ptr 8[ESP]
        fld qword ptr FLAT:_DATA[018h]
        fmul    qword ptr 010h[ESP]
        fstp    qword ptr 010h[ESP]
        fld qword ptr FLAT:_DATA[018h]
        fmul    qword ptr 018h[ESP]
        fstp    qword ptr 018h[ESP]
        jb  L42

-----------------

For comparison, GCC compiling the equivalent C code uses only 5 instructions per loop iteration, which shows how much this could be improved:

v.x *= 1.00000012; v.y *= 1.00000012; v.z *= 1.00000012;

L2:
    fmul    %st, %st(3)
    subl    $1, %eax
    fmul    %st, %st(2)
    fmul    %st, %st(1)
    jne L2

-----------------

C GCC, -mfpmath=sse -msse3

v.x *= 1.00000012; v.y *= 1.00000012; v.z *= 1.00000012;

L2:
        subl    $1, %eax
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        jne     L2

-----------------

C GCC, -mfpmath=sse -msse3 -funroll-loops

L2:
        subl    $8, %eax
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        jne     L2

I have not found a quick way to make GCC vectorize this code, that is, to 
perform two multiplications with a single SSE instruction; I am not sure GCC 
is able to do this automatically.

Bye,
bearophile

Reply via email to