Re: [PATCH] target/i386: reimplement f2xm1 using floatx80 operations
On 12/06/20 01:45, Joseph Myers wrote: > The x87 f2xm1 emulation is currently based around conversion to > double. This is inherently unsuitable for a good emulation of any > floatx80 operation, even before considering that it is a particularly > naive implementation using double (computing with pow and then > subtracting 1 rather than attempting a better emulation using expm1). > > Reimplement using the soft-float operations, including additions and > multiplications with higher precision where appropriate to limit > accumulation of errors. I considered reusing some of the m68k code > for transcendental operations, but the instructions don't generally > correspond exactly to x87 operations (for example, m68k has 2^x and > e^x - 1, but not 2^x - 1); to avoid possible accumulation of errors > from applying multiple such operations each rounding to floatx80 > precision, I wrote a direct implementation of 2^x - 1 instead. It > would be possible in principle to make the implementation more > efficient by doing the intermediate operations directly with > significands, signs and exponents and not packing / unpacking floatx80 > format for each operation, but that would make it significantly more > complicated and it's not clear that's worthwhile; the m68k emulation > doesn't try to do that. > > A test is included with many randomly generated inputs. The > assumption of the test is that the result in round-to-nearest mode > should always be one of the two closest floating-point numbers to the > mathematical value of 2^x - 1; the implementation aims to do somewhat > better than that (about 70 correct bits before rounding). I haven't > investigated how accurate hardware is. > > Signed-off-by: Joseph Myers > > --- > > This patch depends on at least some of my previous x87 emulation fixes > being present (probably only the ones in the recent pull request; I > don't think it depends on any of the most recent series fixing fprem > and fprem1). I expect to make similar fixes for fyl2xp1, fyl2x and > fpatan. (The other transcendental instructions (fcos, fptan, fsin, > fsincos) may follow, but as a lower priority, as I'm aiming at getting > reasonable glibc test results under QEMU and those trigonometric > instructions - with their documented semantics that they are defined > to do range reduction using a specific 66-bit approximation to pi - > aren't used in glibc.) > > checkpatch.pl has its usual false-positive complaints about hex > floating-point constants in the testcase. It also complains about > lines over 80 columns in that test; I can reformat the test if > desired, but it's not clear line length matters for such a randomly > generated table of test inputs and expected results. > > --- > target/i386/fpu_helper.c | 385 +- > tests/tcg/i386/test-i386-f2xm1.c | 1140 ++ > 2 files changed, 1522 insertions(+), 3 deletions(-) > create mode 100644 tests/tcg/i386/test-i386-f2xm1.c > > diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c > index 0e531e3821..8f34ea9776 100644 > --- a/target/i386/fpu_helper.c > +++ b/target/i386/fpu_helper.c > @@ -25,6 +25,7 @@ > #include "exec/exec-all.h" > #include "exec/cpu_ldst.h" > #include "fpu/softfloat.h" > +#include "fpu/softfloat-macros.h" > > #ifdef CONFIG_SOFTMMU > #include "hw/irq.h" > @@ -836,12 +837,390 @@ void helper_fbst_ST0(CPUX86State *env, target_ulong > ptr) > merge_exception_flags(env, old_flags); > } > > +/* 128-bit significand of log(2). */ > +#define ln2_sig_high 0xb17217f7d1cf79abULL > +#define ln2_sig_low 0xc9e3b39803f2f6afULL > + > +/* > + * Polynomial coefficients for an approximation to (2^x - 1) / x, on > + * the interval [-1/64, 1/64]. > + */ > +#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL) > +#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL) > +#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL) > +#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL) > +#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL) > +#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL) > +#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL) > +#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL) > +#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL) > + > +struct f2xm1_data { > +/* > + * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1 > + * are very close to exact floatx80 values. > + */ > +floatx80 t; > +/* The value of 2^t. */ > +floatx80 exp2; > +/* The value of 2^t - 1. */ > +floatx80 exp2m1; > +}; > + > +static const struct f2xm1_data f2xm1_table[65] = { > +{ make_floatx80(0xbfff, 0x8000ULL), > + make_floatx80(0x3ffe, 0x8000ULL), > + make_floatx80(0xbffe, 0x8000ULL) }, > +{ make_floatx80(0xbffe, 0xf8002e7eULL), > +
[PATCH] target/i386: reimplement f2xm1 using floatx80 operations
The x87 f2xm1 emulation is currently based around conversion to double. This is inherently unsuitable for a good emulation of any floatx80 operation, even before considering that it is a particularly naive implementation using double (computing with pow and then subtracting 1 rather than attempting a better emulation using expm1). Reimplement using the soft-float operations, including additions and multiplications with higher precision where appropriate to limit accumulation of errors. I considered reusing some of the m68k code for transcendental operations, but the instructions don't generally correspond exactly to x87 operations (for example, m68k has 2^x and e^x - 1, but not 2^x - 1); to avoid possible accumulation of errors from applying multiple such operations each rounding to floatx80 precision, I wrote a direct implementation of 2^x - 1 instead. It would be possible in principle to make the implementation more efficient by doing the intermediate operations directly with significands, signs and exponents and not packing / unpacking floatx80 format for each operation, but that would make it significantly more complicated and it's not clear that's worthwhile; the m68k emulation doesn't try to do that. A test is included with many randomly generated inputs. The assumption of the test is that the result in round-to-nearest mode should always be one of the two closest floating-point numbers to the mathematical value of 2^x - 1; the implementation aims to do somewhat better than that (about 70 correct bits before rounding). I haven't investigated how accurate hardware is. Signed-off-by: Joseph Myers --- This patch depends on at least some of my previous x87 emulation fixes being present (probably only the ones in the recent pull request; I don't think it depends on any of the most recent series fixing fprem and fprem1). I expect to make similar fixes for fyl2xp1, fyl2x and fpatan. (The other transcendental instructions (fcos, fptan, fsin, fsincos) may follow, but as a lower priority, as I'm aiming at getting reasonable glibc test results under QEMU and those trigonometric instructions - with their documented semantics that they are defined to do range reduction using a specific 66-bit approximation to pi - aren't used in glibc.) checkpatch.pl has its usual false-positive complaints about hex floating-point constants in the testcase. It also complains about lines over 80 columns in that test; I can reformat the test if desired, but it's not clear line length matters for such a randomly generated table of test inputs and expected results. --- target/i386/fpu_helper.c | 385 +- tests/tcg/i386/test-i386-f2xm1.c | 1140 ++ 2 files changed, 1522 insertions(+), 3 deletions(-) create mode 100644 tests/tcg/i386/test-i386-f2xm1.c diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c index 0e531e3821..8f34ea9776 100644 --- a/target/i386/fpu_helper.c +++ b/target/i386/fpu_helper.c @@ -25,6 +25,7 @@ #include "exec/exec-all.h" #include "exec/cpu_ldst.h" #include "fpu/softfloat.h" +#include "fpu/softfloat-macros.h" #ifdef CONFIG_SOFTMMU #include "hw/irq.h" @@ -836,12 +837,390 @@ void helper_fbst_ST0(CPUX86State *env, target_ulong ptr) merge_exception_flags(env, old_flags); } +/* 128-bit significand of log(2). */ +#define ln2_sig_high 0xb17217f7d1cf79abULL +#define ln2_sig_low 0xc9e3b39803f2f6afULL + +/* + * Polynomial coefficients for an approximation to (2^x - 1) / x, on + * the interval [-1/64, 1/64]. + */ +#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL) +#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL) +#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL) +#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL) +#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL) +#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL) +#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL) +#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL) +#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL) + +struct f2xm1_data { +/* + * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1 + * are very close to exact floatx80 values. + */ +floatx80 t; +/* The value of 2^t. */ +floatx80 exp2; +/* The value of 2^t - 1. */ +floatx80 exp2m1; +}; + +static const struct f2xm1_data f2xm1_table[65] = { +{ make_floatx80(0xbfff, 0x8000ULL), + make_floatx80(0x3ffe, 0x8000ULL), + make_floatx80(0xbffe, 0x8000ULL) }, +{ make_floatx80(0xbffe, 0xf8002e7eULL), + make_floatx80(0x3ffe, 0x82cd8698ac2b9160ULL), + make_floatx80(0xbffd, 0xfa64f2cea7a8dd40ULL) }, +{ make_floatx80(0xbffe, 0xefffe960ULL), + make_floatx80(0x3ffe, 0x85aac367cc488345ULL), + make_floatx80(0xbffd, 0xf4aa7930676ef976ULL) }, +