Use softfloat-parts.h so that we can more naturally perform the required operations with a single rounding step. This happens to also simplify the NaN detection step.
Signed-off-by: Richard Henderson <[email protected]> --- target/arm/tcg/vec_helper.c | 77 +++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 91e98d28ae..85bcaac3d1 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -22,6 +22,7 @@ #include "helper.h" #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" +#include "fpu/softfloat-parts.h" #include "qemu/int128.h" #include "crypto/clmul.h" #include "vec_internal.h" @@ -2895,61 +2896,63 @@ float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst, float_status *fpst_odd) { + /* Unpack two BFloat16 into two Float32, trivially. */ float32 s1r = e1 << 16; float32 s1c = e1 & 0xffff0000u; float32 s2r = e2 << 16; float32 s2c = e2 & 0xffff0000u; float32 t32; + /* + * Compare f16_dotadd() in sme_helper.c, but here we have + * bfloat16 inputs. In particular that means that we do not + * want the FPCR.FZ16 flush semantics, so we use the normal + * float_status for the input handling here. + */ + FloatParts64 p1r = float32_unpack_canonical(s1r, fpst); + FloatParts64 p1c = float32_unpack_canonical(s1c, fpst); + FloatParts64 p2r = float32_unpack_canonical(s2r, fpst); + FloatParts64 p2c = float32_unpack_canonical(s2c, fpst); + + int all_mask = (float_cmask(p1r.cls) | float_cmask(p1c.cls) | + float_cmask(p1r.cls) | float_cmask(p1c.cls)); + /* C.f. 
FPProcessNaNs4 */ - if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) || - float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) { - if (float32_is_signaling_nan(s1r, fpst)) { - t32 = s1r; - } else if (float32_is_signaling_nan(s1c, fpst)) { - t32 = s1c; - } else if (float32_is_signaling_nan(s2r, fpst)) { - t32 = s2r; - } else if (float32_is_signaling_nan(s2c, fpst)) { - t32 = s2c; - } else if (float32_is_any_nan(s1r)) { - t32 = s1r; - } else if (float32_is_any_nan(s1c)) { - t32 = s1c; - } else if (float32_is_any_nan(s2r)) { - t32 = s2r; + if (unlikely(all_mask & float_cmask_anynan)) { + if (unlikely(all_mask & float_cmask_snan)) { + if (p1r.cls == float_class_snan) { + t32 = s1r; + } else if (p1c.cls == float_class_snan) { + t32 = s1c; + } else if (p2r.cls == float_class_snan) { + t32 = s2r; + } else { + t32 = s2c; + } } else { - t32 = s2c; + if (p1r.cls == float_class_qnan) { + t32 = s1r; + } else if (p1c.cls == float_class_qnan) { + t32 = s1c; + } else if (p2r.cls == float_class_qnan) { + t32 = s2r; + } else { + t32 = s2c; + } } /* * FPConvertNaN(FPProcessNaN(t32)) will be done as part * of the final addition below. */ } else { - /* - * Compare f16_dotadd() in sme_helper.c, but here we have - * bfloat16 inputs. In particular that means that we do not - * want the FPCR.FZ16 flush semantics, so we use the normal - * float_status for the input handling here. - */ - float64 e1r = float32_to_float64(s1r, fpst); - float64 e1c = float32_to_float64(s1c, fpst); - float64 e2r = float32_to_float64(s2r, fpst); - float64 e2c = float32_to_float64(s2c, fpst); - float64 t64; - /* * The ARM pseudocode function FPDot performs both multiplies - * and the add with a single rounding operation. Emulate this - * by performing the first multiply in round-to-odd, then doing - * the second multiply as fused multiply-add, and rounding to - * float32 all in one step. + * and the add with a single rounding operation. 
*/ - t64 = float64_mul(e1r, e2r, fpst_odd); - t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); + FloatParts64 tmp = parts64_mul(&p1r, &p2r, fpst); + tmp = parts64_muladd(&p1c, &p2c, &tmp, 0, fpst); - /* This conversion is exact, because we've already rounded. */ - t32 = float64_to_float32(t64, fpst); + t32 = float32_round_pack_canonical(&tmp, fpst); } /* The final accumulation step is not fused. */ -- 2.43.0
