https://issues.dlang.org/show_bug.cgi?id=18627
Iain Buclaw <[email protected]> changed: What |Removed |Added ---------------------------------------------------------------------------- Status|RESOLVED |REOPENED Resolution|FIXED |--- --- Comment #15 from Iain Buclaw <[email protected]> --- Not sure if this should really be marked as resolved/fixed, but anyhow... With the following (lazy) function generator: --- import std.complex : C = Complex; import std.meta : AliasSeq; import std.format : format; static foreach (T; AliasSeq!(cfloat, cdouble, creal)) { // Unary operators mixin(format!"%s %s_unary_add(%s a) { return +a; }" (T.stringof, T.stringof, T.stringof)); mixin(format!"%s %s_unary_sub(%s a) { return -a; }" (T.stringof, T.stringof, T.stringof)); // Binary operators mixin(format!"%s %s_binary_add(%s a, %s b) { return a + b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); mixin(format!"%s %s_binary_sub(%s a, %s b) { return a - b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); mixin(format!"%s %s_binary_mul(%s a, %s b) { return a * b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); mixin(format!"%s %s_binary_div(%s a, %s b) { return a / b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); } static foreach (T; AliasSeq!(float, double, real)) { // Unary operators mixin(format!"C!%s std_c%s_unary_add(C!%s a) { return +a; }" (T.stringof, T.stringof, T.stringof)); mixin(format!"C!%s std_c%s_unary_sub(C!%s a) { return -a; }" (T.stringof, T.stringof, T.stringof)); // Binary operators mixin(format!"C!%s std_c%s_binary_add(C!%s a, C!%s b) { return a + b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); mixin(format!"C!%s std_c%s_binary_sub(C!%s a, C!%s b) { return a - b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); mixin(format!"C!%s std_c%s_binary_mul(C!%s a, C!%s b) { return a * b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); mixin(format!"C!%s std_c%s_binary_div(C!%s a, C!%s b) { return a / b; }" (T.stringof, T.stringof, T.stringof, T.stringof)); } --- On x86_64/GDC, 
the results are: ======================================== cfloat_unary_add: movq %xmm0, -8(%rsp) movss -8(%rsp), %xmm0 movss %xmm0, -16(%rsp) movss -4(%rsp), %xmm0 movss %xmm0, -12(%rsp) movq -16(%rsp), %xmm0 ret --- std_cfloat_unary_add: ret ======================================== cdouble_unary_add: ret --- std_cdouble_unary_add: ret ======================================== creal_unary_add: fldt 8(%rsp) fldt 24(%rsp) fxch %st(1) ret --- std_creal_unary_add: movdqa 8(%rsp), %xmm0 movdqa 24(%rsp), %xmm1 movq %rdi, %rax movaps %xmm0, (%rdi) movaps %xmm1, 16(%rdi) ret ======================================== cfloat_unary_sub: movq %xmm0, -8(%rsp) movss -8(%rsp), %xmm0 movss .LC4(%rip), %xmm2 movaps %xmm0, %xmm1 movss -4(%rsp), %xmm0 xorps %xmm2, %xmm1 xorps %xmm2, %xmm0 movss %xmm1, -16(%rsp) movss %xmm0, -12(%rsp) movq -16(%rsp), %xmm0 ret .LC4: .long -2147483648 .long 0 .long 0 .long 0 --- std_cfloat_unary_sub: movq .LC7(%rip), %xmm1 xorps %xmm1, %xmm0 ret .LC7: .long -2147483648 .long -2147483648 ======================================== cdouble_unary_sub: movq .LC5(%rip), %xmm2 xorpd %xmm2, %xmm1 xorpd %xmm2, %xmm0 ret .LC5: .long 0 .long -2147483648 .long 0 .long 0 --- std_cdouble_unary_sub: movq %xmm0, -24(%rsp) movq %xmm1, -16(%rsp) movapd -24(%rsp), %xmm2 xorpd .LC8(%rip), %xmm2 movaps %xmm2, -24(%rsp) movsd -16(%rsp), %xmm1 movsd -24(%rsp), %xmm0 ret .LC8: .long 0 .long -2147483648 .long 0 .long -2147483648 ======================================== creal_unary_sub: fldt 8(%rsp) fchs fldt 24(%rsp) fchs fxch %st(1) ret --- std_creal_unary_sub: fldt 24(%rsp) movq %rdi, %rax fchs fldt 8(%rsp) fchs fstpt (%rdi) fstpt 16(%rdi) ret ======================================== cfloat_binary_add: movq %xmm0, -8(%rsp) movq %xmm1, -16(%rsp) movss -8(%rsp), %xmm1 movss -16(%rsp), %xmm0 addss %xmm0, %xmm1 movss -12(%rsp), %xmm0 addss -4(%rsp), %xmm0 movss %xmm1, -24(%rsp) movss %xmm0, -20(%rsp) movq -24(%rsp), %xmm0 ret --- std_cfloat_binary_add: addps %xmm1, %xmm0 ret 
======================================== cdouble_binary_add: addsd %xmm3, %xmm1 addsd %xmm2, %xmm0 ret --- std_cdouble_binary_add: movq %xmm0, -40(%rsp) movq %xmm1, -32(%rsp) movq %xmm2, -24(%rsp) movq %xmm3, -16(%rsp) movapd -24(%rsp), %xmm4 addpd -40(%rsp), %xmm4 movaps %xmm4, -40(%rsp) movsd -32(%rsp), %xmm1 movsd -40(%rsp), %xmm0 ret ======================================== creal_binary_add: fldt 8(%rsp) fldt 40(%rsp) faddp %st, %st(1) fldt 24(%rsp) fldt 56(%rsp) faddp %st, %st(1) fxch %st(1) ret --- std_creal_binary_add: fldt 24(%rsp) movq %rdi, %rax fldt 56(%rsp) faddp %st, %st(1) fldt 40(%rsp) fldt 8(%rsp) faddp %st, %st(1) fstpt (%rdi) fstpt 16(%rdi) ret ======================================== cfloat_binary_sub: movq %xmm0, -8(%rsp) movss -8(%rsp), %xmm0 movq %xmm1, -16(%rsp) movaps %xmm0, %xmm1 movss -4(%rsp), %xmm0 subss -16(%rsp), %xmm1 subss -12(%rsp), %xmm0 movss %xmm1, -24(%rsp) movss %xmm0, -20(%rsp) movq -24(%rsp), %xmm0 ret --- std_cfloat_binary_sub: subps %xmm1, %xmm0 ret ======================================== cdouble_binary_sub: subsd %xmm3, %xmm1 subsd %xmm2, %xmm0 ret --- std_cdouble_binary_sub: movq %xmm0, -40(%rsp) movq %xmm1, -32(%rsp) movapd -40(%rsp), %xmm4 movq %xmm2, -24(%rsp) movq %xmm3, -16(%rsp) subpd -24(%rsp), %xmm4 movaps %xmm4, -40(%rsp) movsd -32(%rsp), %xmm1 movsd -40(%rsp), %xmm0 ret ======================================== creal_binary_sub: fldt 8(%rsp) fldt 40(%rsp) fsubrp %st, %st(1) fldt 24(%rsp) fldt 56(%rsp) fsubrp %st, %st(1) fxch %st(1) ret --- std_creal_binary_sub: fldt 24(%rsp) movq %rdi, %rax fldt 56(%rsp) fsubrp %st, %st(1) fldt 8(%rsp) fldt 40(%rsp) fsubrp %st, %st(1) fstpt (%rdi) fstpt 16(%rdi) ret ======================================== cfloat_binary_mul: movq %xmm0, -8(%rsp) movss -8(%rsp), %xmm0 movss -4(%rsp), %xmm2 movq %xmm1, -16(%rsp) movss -16(%rsp), %xmm3 movss -12(%rsp), %xmm4 movaps %xmm0, %xmm1 movaps %xmm2, %xmm5 mulss %xmm3, %xmm1 mulss %xmm4, %xmm5 mulss %xmm4, %xmm0 mulss %xmm3, %xmm2 subss 
%xmm5, %xmm1 addss %xmm2, %xmm0 movss %xmm1, -24(%rsp) movss %xmm0, -20(%rsp) movq -24(%rsp), %xmm0 ret --- std_cfloat_binary_mul: movdqa %xmm0, %xmm2 movaps %xmm1, %xmm0 shufps $0xe5, %xmm1, %xmm1 shufps $0xe0, %xmm0, %xmm0 mulps %xmm2, %xmm0 shufps $0xe1, %xmm2, %xmm2 mulps %xmm1, %xmm2 movaps %xmm0, %xmm1 subps %xmm2, %xmm1 addps %xmm2, %xmm0 movss %xmm1, %xmm0 ret ======================================== cdouble_binary_mul: movapd %xmm0, %xmm4 movapd %xmm1, %xmm5 mulsd %xmm2, %xmm0 mulsd %xmm3, %xmm5 mulsd %xmm3, %xmm4 mulsd %xmm2, %xmm1 subsd %xmm5, %xmm0 addsd %xmm4, %xmm1 ret --- std_cdouble_binary_mul: movq %xmm2, -40(%rsp) movq %xmm3, -32(%rsp) movapd -40(%rsp), %xmm2 movq %xmm1, -16(%rsp) movapd -40(%rsp), %xmm1 movq %xmm0, -24(%rsp) movapd -24(%rsp), %xmm0 unpcklpd %xmm2, %xmm2 mulpd -24(%rsp), %xmm2 unpckhpd %xmm1, %xmm1 shufpd $1, %xmm0, %xmm0 mulpd %xmm1, %xmm0 movapd %xmm2, %xmm1 subpd %xmm0, %xmm1 addpd %xmm0, %xmm2 movsd %xmm1, %xmm2 movaps %xmm2, -40(%rsp) movsd -32(%rsp), %xmm1 movsd -40(%rsp), %xmm0 ret ======================================== creal_binary_mul: fldt 8(%rsp) fldt 24(%rsp) fldt 40(%rsp) fldt 56(%rsp) fld %st(3) fmul %st(2), %st fld %st(3) fmul %st(2), %st fsubrp %st, %st(1) fxch %st(4) fmulp %st, %st(1) fxch %st(2) fmulp %st, %st(1) faddp %st, %st(1) fxch %st(1) ret --- std_creal_binary_mul: fldt 40(%rsp) movq %rdi, %rax fldt 56(%rsp) fldt 24(%rsp) fldt 8(%rsp) fld %st(3) fmul %st(1), %st fld %st(2) fmul %st(4), %st fsubrp %st, %st(1) fstpt (%rdi) fxch %st(3) fmulp %st, %st(1) fxch %st(2) fmulp %st, %st(1) faddp %st, %st(1) fstpt 16(%rdi) ret ======================================== cfloat_binary_div: movq %xmm1, -16(%rsp) movss -16(%rsp), %xmm5 movss -12(%rsp), %xmm4 movq %xmm0, -8(%rsp) movss -8(%rsp), %xmm3 movss -4(%rsp), %xmm0 movaps %xmm5, %xmm2 movaps %xmm4, %xmm1 mulss %xmm4, %xmm1 movaps %xmm0, %xmm6 mulss %xmm5, %xmm2 mulss %xmm4, %xmm6 mulss %xmm5, %xmm0 addss %xmm1, %xmm2 movaps %xmm3, %xmm1 mulss %xmm5, %xmm1 mulss 
%xmm4, %xmm3 addss %xmm6, %xmm1 subss %xmm3, %xmm0 divss %xmm2, %xmm1 divss %xmm2, %xmm0 movss %xmm1, -24(%rsp) movss %xmm0, -20(%rsp) movq -24(%rsp), %xmm0 ret --- std_cfloat_binary_div: movq %xmm1, %rax movdqa %xmm1, %xmm2 movdqa %xmm0, %xmm3 shrq $32, %rax movaps %xmm2, %xmm4 mulss %xmm2, %xmm4 movd %eax, %xmm1 movq %xmm0, %rax movaps %xmm1, %xmm0 shrq $32, %rax mulss %xmm1, %xmm0 movq %rax, %xmm5 movd %eax, %xmm6 mulss %xmm1, %xmm6 addss %xmm0, %xmm4 movaps %xmm2, %xmm0 mulss %xmm3, %xmm0 mulss %xmm5, %xmm2 mulss %xmm1, %xmm3 addss %xmm6, %xmm0 subss %xmm3, %xmm2 divss %xmm4, %xmm0 divss %xmm4, %xmm2 unpcklps %xmm2, %xmm0 ret ======================================== cdouble_binary_div: movapd %xmm0, %xmm4 movapd %xmm2, %xmm5 movapd %xmm3, %xmm0 mulsd %xmm3, %xmm0 movapd %xmm1, %xmm6 mulsd %xmm2, %xmm5 mulsd %xmm3, %xmm6 mulsd %xmm2, %xmm1 addsd %xmm0, %xmm5 movapd %xmm4, %xmm0 mulsd %xmm2, %xmm0 mulsd %xmm3, %xmm4 addsd %xmm6, %xmm0 subsd %xmm4, %xmm1 divsd %xmm5, %xmm0 divsd %xmm5, %xmm1 ret --- std_cdouble_binary_div: movq %xmm2, -40(%rsp) movsd -40(%rsp), %xmm2 movq %xmm3, -32(%rsp) movapd -40(%rsp), %xmm3 movsd -32(%rsp), %xmm4 movq %xmm1, -16(%rsp) movapd -40(%rsp), %xmm1 mulsd %xmm2, %xmm2 movq %xmm0, -24(%rsp) mulsd %xmm4, %xmm4 unpcklpd %xmm3, %xmm3 movapd -24(%rsp), %xmm0 mulpd -24(%rsp), %xmm3 unpckhpd %xmm1, %xmm1 shufpd $1, %xmm0, %xmm0 mulpd %xmm1, %xmm0 addsd %xmm4, %xmm2 movapd %xmm3, %xmm1 addpd %xmm0, %xmm1 subpd %xmm0, %xmm3 unpcklpd %xmm2, %xmm2 movsd %xmm1, %xmm3 divpd %xmm2, %xmm3 movaps %xmm3, -40(%rsp) movsd -32(%rsp), %xmm1 movsd -40(%rsp), %xmm0 ret ======================================== creal_binary_div: fldt 8(%rsp) fldt 24(%rsp) fldt 40(%rsp) fldt 56(%rsp) fld %st(1) fmul %st(2), %st fld %st(1) fmul %st(2), %st faddp %st, %st(1) fld %st(4) fmul %st(3), %st fld %st(4) fmul %st(3), %st faddp %st, %st(1) fdiv %st(1), %st fxch %st(4) fmulp %st, %st(3) fxch %st(4) fmulp %st, %st(1) fsubrp %st, %st(1) fdivp %st, %st(2) ret --- 
std_creal_binary_div: fldt 40(%rsp) movq %rdi, %rax fldt 56(%rsp) fldt 24(%rsp) fldt 8(%rsp) fld %st(3) fmul %st(4), %st fld %st(3) fmul %st(4), %st faddp %st, %st(1) fld %st(4) fmul %st(2), %st fld %st(3) fmul %st(5), %st faddp %st, %st(1) fdiv %st(1), %st fstpt (%rdi) fxch %st(4) fmulp %st, %st(2) fmulp %st, %st(2) fsubp %st, %st(1) fdivp %st, %st(1) fstpt 16(%rdi) ret ======================================== Just visually comparing: - cfloat -> Complex!float looks to be negligible. - creal -> Complex!real just adds a small overhead of moving data on/off ST registers (this is expected, and not a performance bug). - cdouble -> Complex!double, it may look like cdouble still has a small edge, however the use of *pd instructions on the std.complex would in fact make it quicker (i.e., one divpd is 2x faster than two divsd instructions in the cdouble_binary_div functions). I actually found that LLVM seemed able to pick up the intent of the FastMath complex divide functions, so LDC might give a more pleasing output. Benchmarks to follow soon... --
