The floating point code used in fpu_helper.c can be sped up by using the IEEE 754 support added to the C99 standard. To test this code out simply set and unset the I_NEED_SPEED macro. The program to test out each version of the helper_fmadd() function is below the patch. It needs to be ran in the guest. The emulator to use is qemu-system-ppc. I used a Mac OS X guest, but the test program would compile on a Linux guest.
This patch does make the fused multiply-add instruction fmadd work faster and still give a correct result. This documentation might be of help to those who want to learn more about C99's IEEE 754 support: http://grouper.ieee.org/groups/754/meeting-materials/2001-07-18-c99.pdf --- target-ppc/fpu_helper.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 102 insertions(+), 3 deletions(-) diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c index b0760f0..05eb26e 100644 --- a/target-ppc/fpu_helper.c +++ b/target-ppc/fpu_helper.c @@ -20,10 +20,22 @@ #include "cpu.h" #include "exec/helper-proto.h" #include "exec/exec-all.h" +#include <inttypes.h> +#include <math.h> +#include <fenv.h> #define float64_snan_to_qnan(x) ((x) | 0x0008000000000000ULL) #define float32_snan_to_qnan(x) ((x) | 0x00400000) +#define DEBUG_FPU 0 + +#define DPRINTF(fmt, ...) do { \ + if (DEBUG_FPU) { \ + printf("FPU: " fmt , ## __VA_ARGS__); \ + } \ +} while (0); + + /*****************************************************************************/ /* Floating point operations helpers */ uint64_t helper_float32_to_float64(CPUPPCState *env, uint32_t arg) @@ -281,29 +293,36 @@ static inline void float_inexact_excp(CPUPPCState *env) static inline void fpscr_set_rounding_mode(CPUPPCState *env) { - int rnd_type; + int rnd_type, result = 0; /* Set rounding mode */ switch (fpscr_rn) { case 0: /* Best approximation (round to nearest) */ rnd_type = float_round_nearest_even; + result = fesetround(FE_TONEAREST); break; case 1: /* Smaller magnitude (round toward zero) */ rnd_type = float_round_to_zero; + result = fesetround(FE_TOWARDZERO); break; case 2: /* Round toward +infinite */ rnd_type = float_round_up; + result = fesetround(FE_UPWARD); break; default: case 3: /* Round toward -infinite */ rnd_type = float_round_down; + result = fesetround(FE_DOWNWARD); break; } set_float_rounding_mode(rnd_type, &env->fp_status); + if (result != 0) { + printf("Error: rounding mode was not set\n"); + } } void helper_fpscr_clrbit(CPUPPCState *env, uint32_t bit) @@ -534,6 +553,7 @@ void helper_float_check_status(CPUPPCState *env) void helper_reset_fpstatus(CPUPPCState *env) { set_float_exception_flags(0, &env->fp_status); + feclearexcept(FE_ALL_EXCEPT); } /* fadd - fadd. */ @@ -737,16 +757,94 @@ uint64_t helper_frim(CPUPPCState *env, uint64_t arg) return do_fri(env, arg, float_round_down); } -/* fmadd - fmadd. */ +#define I_NEED_SPEED 1 +#ifdef I_NEED_SPEED + +union Converter { + uint64_t i; + double d; +}; + +typedef union Converter Converter; + +/* fmadd - fmadd. - fast */ uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, uint64_t arg2, uint64_t arg3) { + DPRINTF("Fast helper_fmadd() called\n"); + Converter farg1, farg2, farg3, result; + + farg1.i = arg1; + farg2.i = arg2; + farg3.i = arg3; + + DPRINTF("farg1.d = %f\n", farg1.d); + DPRINTF("farg2.d = %f\n", farg2.d); + DPRINTF("farg3.d = %f\n", farg3.d); + + /* if signalling NaN operation */ + if (unlikely(float64_is_signaling_nan(farg1.d, &env->fp_status) || + float64_is_signaling_nan(farg2.d, &env->fp_status) || + float64_is_signaling_nan(farg3.d, &env->fp_status))) { + float_invalid_op_excp(env, POWERPC_EXCP_FP_VXSNAN, 1); + } + + result.d = fma(farg1.d, farg2.d, farg3.d); /* fused multiply-add function */ + if (fetestexcept(FE_INEXACT)) { + DPRINTF("FE_INEXACT\n"); + float_inexact_excp(env); + } + if (fetestexcept(FE_INVALID)) { + DPRINTF("FE_INVALID\n"); + + /* 0 * infinity */ + if ((fpclassify(farg1.d) == FP_ZERO) && isinf(farg2.d)) { + result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXIMZ, 1); + } + + /* infinity * 0 */ + else if (isinf(farg1.d) && (fpclassify(farg2.d) == FP_ZERO)) { + result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXIMZ, 1); + } + + /* infinity - infinity */ + else if (isinf(farg1.d * farg2.d) && isinf(farg3.d) && (signbit(farg3.d) != 0)) { + result.i = float_invalid_op_excp(env, POWERPC_EXCP_FP_VXISI, 1); + } + } + if (fetestexcept(FE_OVERFLOW)) { + DPRINTF("FE_OVERFLOW\n"); + float_overflow_excp(env); + } + if (fetestexcept(FE_UNDERFLOW)) { + DPRINTF("FE_UNDERFLOW\n"); + float_underflow_excp(env); + } + + DPRINTF("result.d = %f\n", result.d); + DPRINTF("result.i = 0x%" PRIx64 "\n", result.i); + + return result.i; +} + +#else + +/* fmadd - fmadd. - original */ +uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, uint64_t arg2, + uint64_t arg3) +{ + DPRINTF("old helper_fmadd() called\n"); + CPU_DoubleU farg1, farg2, farg3; farg1.ll = arg1; farg2.ll = arg2; farg3.ll = arg3; + DPRINTF("farg1.d = %f\n", farg1.d); + DPRINTF("farg2.d = %f\n", farg2.d); + DPRINTF("farg3.d = %f\n", farg3.d); + if (unlikely((float64_is_infinity(farg1.d) && float64_is_zero(farg2.d)) || (float64_is_zero(farg1.d) && float64_is_infinity(farg2.d)))) { /* Multiplication of zero by infinity */ @@ -775,9 +873,10 @@ uint64_t helper_fmadd(CPUPPCState *env, uint64_t arg1, uint64_t arg2, farg1.d = float128_to_float64(ft0_128, &env->fp_status); } } - + DPRINTF("farg1.ll = 0x%" PRIx64 "\n", farg1.ll); return farg1.ll; } +#endif /* fmsub - fmsub. */ uint64_t helper_fmsub(CPUPPCState *env, uint64_t arg1, uint64_t arg2, -- 2.7.2 /********************************************/ /************** Test program ****************/ /********************************************/ #include <stdio.h> #include <time.h> #include <math.h> /* fmadds basically does this frD = frA * frC + frB */ int main (int argc, const char * argv[]) { const int iteration_count = pow(10,6); // You might have to change the second argument double iter, frD, frA, frB, frC, total, answer; clock_t start_time, end_time; frA = 1.0014; frB = 0.00001; frC = 0.000015; total = 0.0; answer = 88351739272997344.0; start_time = clock(); for(iter = 0; iter < iteration_count; iter++) { frA *= 1.00001; frB *= 1.00002; frC *= 1.00003; asm volatile("fmadds %0, %1, %2, %3" : "=f" (frD) : "f" (frA), "f" (frC), "f" (frB)); total = total + frD; } end_time = clock(); printf("Total = %f\n", total); if (total == answer) { printf("Value is correct\n"); } else { printf("Value is incorrect. Answer should be %f\n", answer); } printf("Time elapsed: %0.2f seconds\n", (float)(end_time - start_time) / CLOCKS_PER_SEC); return 0; }