From: Paul Brook <p...@nowt.org> Perpare the horizontal atithmetic vector helpers for AVX These currently use a dummy Reg typed variable to store the result then assign the whole register. This will cause 128 bit operations to corrupt the upper half of the register, so replace it with explicit temporaries and element assignments.
Signed-off-by: Paul Brook <p...@nowt.org> Message-Id: <20220424220204.2493824-18-p...@nowt.org> Reviewed-by: Richard Henderson <richard.hender...@linaro.org> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> --- target/i386/ops_sse.h | 91 ++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 53 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index c9737e16b9..c6dba9572d 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -22,7 +22,6 @@ #if SHIFT == 0 #define Reg MMXReg -#define SIZE 8 #define XMM_ONLY(...) #define B(n) MMX_B(n) #define W(n) MMX_W(n) @@ -31,7 +30,6 @@ #define SUFFIX _mmx #else #define Reg ZMMReg -#define SIZE 16 #define XMM_ONLY(...) __VA_ARGS__ #define B(n) ZMM_B(n) #define W(n) ZMM_W(n) @@ -43,22 +41,6 @@ #define LANE_WIDTH (SHIFT ? 16 : 8) #define PACK_WIDTH (LANE_WIDTH / 2) -/* - * Copy the relevant parts of a Reg value around. In the case where - * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of - * a 64 byte ZMMReg, so we must copy only those and keep the top bytes - * untouched in the guest-visible destination destination register. - * Note that the "lower bytes" are placed last in memory on big-endian - * hosts, which store the vector backwards in memory. In that case the - * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of - * the little-endian case. - */ -#if HOST_BIG_ENDIAN -#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE) -#else -#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE) -#endif - #if SHIFT == 0 #define FPSRL(x, c) ((x) >> shift) #define FPSRAW(x, c) ((int16_t)(x) >> shift) @@ -945,45 +927,49 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length) d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length); } -void glue(helper_haddps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - MOVE(*d, r); +#define SSE_HELPER_HPS(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + float32 r[2 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ + for (i = j = 0; j < 4; i++, j += 2) { \ + r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \ + } \ + for (j = 0; j < 4; i++, j += 2) { \ + r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \ + } \ + } \ + for (i = 0; i < 2 << SHIFT; i++) { \ + d->ZMM_S(i) = r[i]; \ + } \ } -void glue(helper_haddpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; +SSE_HELPER_HPS(haddps, float32_add) +SSE_HELPER_HPS(hsubps, float32_sub) - r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - MOVE(*d, r); +#define SSE_HELPER_HPD(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + float64 r[1 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \ + for (i = j = 0; j < 2; i++, j += 2) { \ + r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \ + } \ + for (j = 0; j < 2; i++, j += 2) { \ + r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \ + } \ + } \ + for (i = 0; i < 1 << SHIFT; i++) { \ + d->ZMM_D(i) = r[i]; \ + } \ } -void glue(helper_hsubps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - MOVE(*d, r); -} - -void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - MOVE(*d, r); -} +SSE_HELPER_HPD(haddpd, float64_add) +SSE_HELPER_HPD(hsubpd, float64_sub) void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { @@ -2331,4 +2317,3 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #undef L #undef Q #undef SUFFIX -#undef SIZE -- 2.37.2