On 4/26/22 05:50, Lucas Mateus Castro(alqotel) wrote:
+#define VSXGER16(NAME, ORIG_T, OR_EL) \
+ void NAME(CPUPPCState *env, uint32_t a_r, uint32_t b_r, \
+ uint32_t at_r, uint32_t mask, uint32_t packed_flags) \
+ { \
+ ppc_vsr_t *at; \
+ float32 psum, aux_acc, va, vb, vc, vd; \
+ int i, j, xmsk_bit, ymsk_bit; \
+ uint8_t xmsk = mask & 0x0F; \
+ uint8_t ymsk = (mask >> 4) & 0x0F; \
+ uint8_t pmsk = (mask >> 8) & 0x3; \
+ ppc_vsr_t *b = cpu_vsr_ptr(env, b_r); \
+ ppc_vsr_t *a = cpu_vsr_ptr(env, a_r); \
+ float_status *excp_ptr = &env->fp_status; \
+ bool acc = ger_acc_flag(packed_flags); \
+ bool neg_acc = ger_neg_acc_flag(packed_flags); \
+ bool neg_mul = ger_neg_mul_flag(packed_flags); \
+ for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) { \
+ at = cpu_vsr_ptr(env, at_r + i); \
+ for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) {\
+ if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) { \
+ va = !(pmsk & 2) ? float32_zero : \
+ GET_VSR(Vsr##OR_EL, a, \
+ 2 * i, ORIG_T, float32); \
+ vb = !(pmsk & 2) ? float32_zero : \
+ GET_VSR(Vsr##OR_EL, b, \
+ 2 * j, ORIG_T, float32); \
+ vc = !(pmsk & 1) ? float32_zero : \
+ GET_VSR(Vsr##OR_EL, a, \
+ 2 * i + 1, ORIG_T, float32);\
+ vd = !(pmsk & 1) ? float32_zero : \
+ GET_VSR(Vsr##OR_EL, b, \
+ 2 * j + 1, ORIG_T, float32);\
+ psum = float32_mul(va, vb, excp_ptr); \
+ psum = float32_muladd(vc, vd, psum, 0, excp_ptr); \
This isn't correct -- the intermediate 'prod' (the first multiply) is not rounded. I
think the correct way to implement this (barring new softfloat functions) is to compute
the intermediate product as float64 with float_round_to_odd, then float64r32_muladd into
the correct rounding mode to finish.
+ if (acc) { \
+ if (neg_mul) { \
+ psum = float32_neg(psum); \
+ } \
+ if (neg_acc) { \
+ aux_acc = float32_neg(at->VsrSF(j)); \
+ } else { \
+ aux_acc = at->VsrSF(j); \
+ } \
+ at->VsrSF(j) = float32_add(psum, aux_acc, \
+ excp_ptr); \
This one, thankfully, uses the rounded intermediate result 'msum', so is ok.
Please do convert this from a macro. Given that float16 and bfloat16 are addressed the
same, I think the only callback you need is the conversion from float16_to_float64. Drop
the bf16 accessor to ppc_vsr_t.
r~