I wondered how the precision of typical SSE vs. FPU code would
compare. SSE code uses floats and doubles as the main means to
control precision, while the FPU is configured via the x87
control word.
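
(An aside, and my sketch rather than anything from the program
below: the precision-control field occupies bits 8-9 of the x87
control word - 00 selects a 24-bit significand, 10 a 53-bit one,
11 the full 64 bits. SSE has no counterpart in MXCSR; the
instruction itself fixes the precision, divss vs. divsd. Reading
the field under DMD looks like this:)

import std.stdio;

void main()
{
    ushort cw;
    asm { fstcw cw; } // store the x87 control word to memory
    // typically prints 11 (64-bit), since D's real relies on it
    writefln("PC field = %02b", (cw >> 8) & 0b11);
}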
The attached program computes 5^-441 by repeated division using
both methods, then converts the result to double and prints its
mantissa:
Compiled with Digital Mars D in 64-bit
float SSE : 00000000000000000000000000000000000000000000000000000
float x87 : 00100000101010100111001110000000000000000000000000000
double SSE: 00100000101010100111001101111011011110011011110010001
double x87: 00100000101010100111001101111011011110011011110010000
real x87 : 00100000101010100111001101111011011110011011110001111
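
To put the all-zero float SSE row into perspective, here is a
back-of-the-envelope check (my addition, not output of the
attached program): 5^-441 sits near 2^-1024, inside double's
subnormal range but far below float's smallest subnormal, 2^-149.

import std.stdio, std.math;

void main()
{
    // binary exponent of 5^-441: -441 * log2(5), about -1023.97
    writefln("5^-441 = 2^%.2f", -441 * log2(5.0));
    // doubles hold subnormals down to 2^-1074, hence the two
    // leading zero mantissa bits above; floats end at 2^-149
}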
Take-aways:
- SSE results generally differ from x87 results at the same
precision
- x87 produces an accurate single-precision result, whereas
the error with SSE is fatal (and that's not flush-to-zero at
work; the greatest power of five that still leaves 1 bit of
mantissa is 5^-64(!) - see the quick check below). The reason:
x87 precision control only narrows the significand while the
exponent keeps its 15-bit range, so intermediates never
underflow; the SSE float underflows gradually to zero.
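
A quick check of that 5^-64 claim (my addition; the commented-out
enum loops = 64; in the program hints at the same experiment):

import std.stdio;

void main()
{
    float f = 1;
    foreach (i; 1 .. 100)
    {
        f /= 5; // a single-precision SSE divide under DMD in 64-bit
        if (f == 0)
        {
            // expected to print 64: 5^-64 = 2^-148.6 still rounds
            // to the smallest subnormal 2^-149, one more step to 0
            writefln("last power with a nonzero float: 5^-%s", i - 1);
            break;
        }
    }
}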
--
Marco

//enum loops = 64;
enum loops = 441;

// precision-control field, bits 8-9 of the x87 control word
enum X87Precision : ushort
{
    _float = 0b00 << 8,  // 24-bit significand
    _double = 0b10 << 8, // 53-bit significand
    _real = 0b11 << 8,   // 64-bit significand
}

// iterated division at the precision of F; for float and double
// DMD emits SSE scalar code in 64-bit mode
F div(F)(F x)
{
    F sse = 1;
    foreach (i; 0 .. loops)
        sse /= x;
    return sse;
}

// the same division loop on the x87, with the control word's
// precision field set to match F (the word is not restored afterwards)
real divX87(F)(real x)
{
    enum prec = mixin("X87Precision._" ~ F.stringof);
    enum mask = ~ushort(0b11 << 8);
    ushort cw;
    real r;
    asm
    {
        fstcw cw;       // save the current control word
        and cw, mask;   // clear the precision bits
        mov CX, prec;
        or cw, CX;      // select the precision for F
        fldcw cw;       // activate it
        fld x;          // ST(1) = x
        fld1;           // ST(0) = 1
        mov ECX, loops;
    Loop:
        fdiv ST, ST(1); // ST(0) /= x, rounded per the control word
        dec ECX;
        jnz Loop;
        fstp ST(1);     // overwrite x with the result and pop
        fstp r;         // pop the result into r
    }
    return r;
}

// print the 53 mantissa bits of x (hidden bit restored for normals)
void printMantissa(string title, double x)
{
    import std.stdio;
    enum hidden = 1L << 52;
    long l = *cast(long*)&x;
    auto exp = (l >> 52) & 0b11111111111;
    auto mant = l & (hidden - 1);
    if (exp) mant |= hidden; // subnormals keep the leading 0
    writefln("%-10s: %053b", title, mant);
}

void main()
{
    import std.stdio, std.meta;
    writefln("Compiled with %s in %s-bit", __VENDOR__, 8 * size_t.sizeof);
    real x = 5;
    foreach (T; AliasSeq!(float, double, real))
    {
        // real has no SSE counterpart (80-bit math runs on the x87),
        // so skip the SSE row for it, matching the output above
        static if (!is(T == real))
        {
            double r1 = div!T(x);
            printMantissa(T.stringof ~ " SSE", r1);
        }
        double r2 = divX87!T(x);
        printMantissa(T.stringof ~ " x87", r2);
    }
}
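
For completeness, a plausible way to build and run it with DMD
(the file name is my invention):

dmd -m64 -run sse_vs_x87.d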