This is an attempt to save some of the cost of sqrt by using the
inbuilt support of the host hardware. The idea is assuming we start
with a valid input we can use the hardware. If any tininess issues
occur this will trip an FPU exception where:
- we turn off cpu->use_host_fpu
- mask the FPU exceptions
- return to what we were doing
Once we return we should pick up the fact that there was something
weird about the operation and fall-back to the pure software
implementation.
You could imagine this being extended for code generation but instead
of returning to the code we could exit and re-generate the TB but this
time with pure software helpers rather than any support from the
hardware.
This is a sort of fix-it-up after the fact approach because reading
the FP state is an expensive operation for everything so let's only
worry about exceptions when they trip...
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
cpus.c| 28
fpu/softfloat.c | 40 +++-
include/fpu/softfloat-types.h | 2 ++
include/fpu/softfloat.h | 4
include/qom/cpu.h | 1 +
linux-user/main.c | 8
linux-user/signal.c | 16
target/arm/cpu.c | 4
8 files changed, 98 insertions(+), 5 deletions(-)
diff --git a/cpus.c b/cpus.c
index f298b659f4..e435f6737b 100644
--- a/cpus.c
+++ b/cpus.c
@@ -23,6 +23,7 @@
*/
#include "qemu/osdep.h"
+#include <fenv.h>
#include "qemu/config-file.h"
#include "cpu.h"
#include "monitor/monitor.h"
@@ -1078,10 +1079,36 @@ static void qemu_init_sigbus(void)
prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
+
+static void sigfpu_handler(int n, siginfo_t *siginfo, void *ctx)
+{
+fprintf(stderr, "%s: got %d, %p/%p\n", __func__, n, siginfo, ctx);
+
+/* Called asynchronously in VCPU thread. */
+g_assert(current_cpu);
+}
+
+static void qemu_init_sigfpu(void)
+{
+struct sigaction action;
+
+memset(&action, 0, sizeof(action));
+action.sa_flags = SA_SIGINFO;
+action.sa_sigaction = sigfpu_handler;
+sigaction(SIGBUS, &action, NULL);
+
+feenableexcept(FE_INVALID |
+ FE_OVERFLOW |
+ FE_UNDERFLOW |
+ FE_INEXACT);
+}
#else /* !CONFIG_LINUX */
static void qemu_init_sigbus(void)
{
}
+static void qemu_init_sigfpu(void)
+{
+}
#endif /* !CONFIG_LINUX */
static QemuMutex qemu_global_mutex;
@@ -1827,6 +1854,7 @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
if (!tcg_region_inited) {
tcg_region_inited = 1;
tcg_region_init();
+qemu_init_sigfpu();
}
if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e7fb0d357a..ec9355af7a 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -1905,10 +1905,12 @@ float64 float64_scalbn(float64 a, int n, float_status
*status)
* bits to ensure we get a correctly rounded result.
*
* This does mean however the calculation is slower than before,
- * especially for 64 bit floats.
+ * especially for 64 bit floats. However the caller can only do checks
+ * if they actually want to off-load to the library.
*/
-static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
+static FloatParts sqrt_float(FloatParts a, float_status *s,
+ const FloatFmt *p, bool check_only)
{
uint64_t a_frac, r_frac, s_frac;
int bit, last_bit;
@@ -1928,6 +1930,10 @@ static FloatParts sqrt_float(FloatParts a, float_status
*s, const FloatFmt *p)
return a; /* sqrt(+inf) = +inf */
}
+if (check_only) {
+return a;
+}
+
assert(a.cls == float_class_normal);
/* We need two overflow bits at the top. Adding room for that is a
@@ -1973,21 +1979,45 @@ static FloatParts sqrt_float(FloatParts a, float_status
*s, const FloatFmt *p)
float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
{
FloatParts pa = float16_unpack_canonical(a, status);
-FloatParts pr = sqrt_float(pa, status, &float16_params);
+FloatParts pr = sqrt_float(pa, status, &float16_params, false);
return float16_round_pack_canonical(pr, status);
}
float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
{
FloatParts pa = float32_unpack_canonical(a, status);
-FloatParts pr = sqrt_float(pa, status, &float32_params);
+FloatParts pr;
+
+if (status->use_host_fpu && *status->use_host_fpu) {
+pr = sqrt_float(pa, status, &float32_params, true);
+if (pr.cls == float_class_normal) {
+float32 r = __builtin_sqrt(a);
+if (*status->use_host_fpu) {
+return r;
+}
+}
+}
+
+pr = sqrt_float(pa, status, &float32_params, false);
return float32_round_pack_canonical(pr, status);
}
float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status