Re: [Qemu-devel] [PATCH] fpu/softfloat: use hardware sqrt if we can (EXPERIMENT!)

2018-03-21 Thread Emilio G. Cota
On Tue, Feb 20, 2018 at 21:01:37 +0000, Alex Bennée wrote:
> This is an attempt to save some of the cost of sqrt by using the
> inbuilt support of the host hardware. The idea is assuming we start
> with a valid input we can use the hardware. If any tininess issues
> occur this will trip an FPU exception where:
> 
>   - we turn off cpu->use_host_fpu
>   - mask the FPU exceptions
>   - return to what we were doing

Please see this thread for an alternative approach:

  [Qemu-devel] [PATCH v1 00/14] fp-test + hostfloat
  https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg05908.html

Emilio



Re: [Qemu-devel] [PATCH] fpu/softfloat: use hardware sqrt if we can (EXPERIMENT!)

2018-02-21 Thread Alex Bennée

Alex Bennée  writes:

> This is an attempt to save some of the cost of sqrt by using the
> inbuilt support of the host hardware. The idea is assuming we start
> with a valid input we can use the hardware. If any tininess issues
> occur this will trip an FPU exception where:
>
>   - we turn off cpu->use_host_fpu
>   - mask the FPU exceptions
>   - return to what we were doing
>
> Once we return we should pick up the fact that there was something
> weird about the operation and fall-back to the pure software
> implementation.
>
> You could imagine this being extended for code generation but instead
> of returning to the code we could exit and re-generate the TB but this
> time with pure software helpers rather than any support from the
> hardware.
>
> This is a sort of fix-it-up after the fact approach because reading
> the FP state is an expensive operation for everything so let's only
> worry about exceptions when they trip...
>

> --- a/linux-user/signal.c
> +++ b/linux-user/signal.c
> @@ -20,6 +20,7 @@
>  #include "qemu/bitops.h"
>  #include 
>  #include 
> +#include <fenv.h>
>
>  #include "qemu.h"
>  #include "qemu-common.h"
> @@ -639,6 +640,21 @@ static void host_signal_handler(int host_signum, 
> siginfo_t *info,
>  ucontext_t *uc = puc;
>  struct emulated_sigtable *k;
>
> +/* Catch any FPU exceptions we might get from having tried to use
> + * the host FPU to speed up some calculations
> + */
> +if (host_signum == SIGFPE && cpu->use_host_fpu) {
> +cpu->use_host_fpu = false;
> +/* sadly this gets lost on the context switch when we return */
> +fedisableexcept(FE_INVALID   |
> +FE_OVERFLOW  |
> +FE_UNDERFLOW |
> +FE_INEXACT);
> +/* sigaddset(&uc->uc_sigmask, SIGFPE); */
> +uc->__fpregs_mem.mxcsr |= 0x1f80;

This is a bug, the correct place to reset mxcsr for the return is:

(uc->uc_mcontext.fpregs)->mxcsr |= 0x1f80;

--
Alex Bennée



[Qemu-devel] [PATCH] fpu/softfloat: use hardware sqrt if we can (EXPERIMENT!)

2018-02-20 Thread Alex Bennée
This is an attempt to save some of the cost of sqrt by using the
inbuilt support of the host hardware. The idea is assuming we start
with a valid input we can use the hardware. If any tininess issues
occur this will trip an FPU exception where:

  - we turn off cpu->use_host_fpu
  - mask the FPU exceptions
  - return to what we were doing

Once we return we should pick up the fact that there was something
weird about the operation and fall-back to the pure software
implementation.

You could imagine this being extended for code generation but instead
of returning to the code we could exit and re-generate the TB but this
time with pure software helpers rather than any support from the
hardware.

This is a sort of fix-it-up after the fact approach because reading
the FP state is an expensive operation for everything so let's only
worry about exceptions when they trip...

Signed-off-by: Alex Bennée 
---
 cpus.c| 28 
 fpu/softfloat.c   | 40 +++-
 include/fpu/softfloat-types.h |  2 ++
 include/fpu/softfloat.h   |  4 
 include/qom/cpu.h |  1 +
 linux-user/main.c |  8 
 linux-user/signal.c   | 16 
 target/arm/cpu.c  |  4 
 8 files changed, 98 insertions(+), 5 deletions(-)

diff --git a/cpus.c b/cpus.c
index f298b659f4..e435f6737b 100644
--- a/cpus.c
+++ b/cpus.c
@@ -23,6 +23,7 @@
  */
 
 #include "qemu/osdep.h"
+#include <fenv.h>
 #include "qemu/config-file.h"
 #include "cpu.h"
 #include "monitor/monitor.h"
@@ -1078,10 +1079,36 @@ static void qemu_init_sigbus(void)
 
 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 }
+
+static void sigfpu_handler(int n, siginfo_t *siginfo, void *ctx)
+{
+fprintf(stderr, "%s: got %d, %p/%p\n", __func__, n, siginfo, ctx);
+
+/* Called asynchronously in VCPU thread.  */
+g_assert(current_cpu);
+}
+
+static void qemu_init_sigfpu(void)
+{
+struct sigaction action;
+
+memset(&action, 0, sizeof(action));
+action.sa_flags = SA_SIGINFO;
+action.sa_sigaction = sigfpu_handler;
+sigaction(SIGBUS, &action, NULL);
+
+feenableexcept(FE_INVALID   |
+   FE_OVERFLOW  |
+   FE_UNDERFLOW |
+   FE_INEXACT);
+}
 #else /* !CONFIG_LINUX */
 static void qemu_init_sigbus(void)
 {
 }
+static void qemu_init_sigfpu(void)
+{
+}
 #endif /* !CONFIG_LINUX */
 
 static QemuMutex qemu_global_mutex;
@@ -1827,6 +1854,7 @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
 if (!tcg_region_inited) {
 tcg_region_inited = 1;
 tcg_region_init();
+qemu_init_sigfpu();
 }
 
 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e7fb0d357a..ec9355af7a 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -1905,10 +1905,12 @@ float64 float64_scalbn(float64 a, int n, float_status 
*status)
  * bits to ensure we get a correctly rounded result.
  *
  * This does mean however the calculation is slower than before,
- * especially for 64 bit floats.
+ * especially for 64 bit floats. However the caller can only do checks
+ * if they actually want to off-load to the library.
  */
 
-static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
+static FloatParts sqrt_float(FloatParts a, float_status *s,
+ const FloatFmt *p, bool check_only)
 {
 uint64_t a_frac, r_frac, s_frac;
 int bit, last_bit;
@@ -1928,6 +1930,10 @@ static FloatParts sqrt_float(FloatParts a, float_status 
*s, const FloatFmt *p)
 return a;  /* sqrt(+inf) = +inf */
 }
 
+if (check_only) {
+return a;
+}
+
 assert(a.cls == float_class_normal);
 
 /* We need two overflow bits at the top. Adding room for that is a
@@ -1973,21 +1979,45 @@ static FloatParts sqrt_float(FloatParts a, float_status 
*s, const FloatFmt *p)
 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
 {
 FloatParts pa = float16_unpack_canonical(a, status);
-FloatParts pr = sqrt_float(pa, status, &float16_params);
+FloatParts pr = sqrt_float(pa, status, &float16_params, false);
 return float16_round_pack_canonical(pr, status);
 }
 
 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
 {
 FloatParts pa = float32_unpack_canonical(a, status);
-FloatParts pr = sqrt_float(pa, status, &float32_params);
+FloatParts pr;
+
+if (status->use_host_fpu && *status->use_host_fpu) {
+pr = sqrt_float(pa, status, &float32_params, true);
+if (pr.cls == float_class_normal) {
+float32 r = __builtin_sqrt(a);
+if (*status->use_host_fpu) {
+return r;
+}
+}
+}
+
+pr = sqrt_float(pa, status, &float32_params, false);
 return float32_round_pack_canonical(pr, status);
 }
 
 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status