This patch fixes incorrect results for `[xv]fnm{add,sub}.{s,d}` instructions when rounding toward {zero, positive, negative}.
According to the LoongArch ISA specification, the result of an instruction like `FNMSUB.D` is computed as: FR[fd] = -FP64_fusedMultiplyAdd(FR[fj], FR[fk], -FR[fa]) Here, `FP64_fusedMultiplyAdd()` performs a fused multiply-add operation compliant with IEEE 754-2008. The negation is applied to the fully rounded result of the fused operation - not to any intermediate value. This behavior is specific to LoongArch and differs from other arches, which is why the existing `float_muladd_negate_result` flag does not model it correctly. To address this, I introduce a new flag `float_muladd_negate_rounded_result`, which applies the negation after rounding. This ensures that rounding decisions based on the sign of the result are handled correctly. Reported-by: mengqinggang <mengqingg...@loongson.cn> Signed-off-by: WANG Rui <wang...@loongson.cn> --- v1 -> v2: - Introduce `float_muladd_negate_rounded_result` --- fpu/softfloat.c | 42 ++++++++++++++++--- include/fpu/softfloat.h | 3 +- .../tcg/insn_trans/trans_farith.c.inc | 10 +++-- target/loongarch/tcg/vec_helper.c | 8 ++-- tests/tcg/loongarch64/Makefile.target | 2 + tests/tcg/loongarch64/test_fnmsub.c | 25 +++++++++++ tests/tcg/loongarch64/test_vfnmsub.c | 27 ++++++++++++ 7 files changed, 102 insertions(+), 15 deletions(-) create mode 100644 tests/tcg/loongarch64/test_fnmsub.c create mode 100644 tests/tcg/loongarch64/test_vfnmsub.c diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 34c962d6bd..2691e89a03 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2234,13 +2234,18 @@ float16_muladd_scalbn(float16 a, float16 b, float16 c, int scale, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; + float16 r; float16_unpack_canonical(&pa, a, status); float16_unpack_canonical(&pb, b, status); float16_unpack_canonical(&pc, c, status); pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status); - return float16_round_pack_canonical(pr, status); + r = float16_round_pack_canonical(pr, status); + if (flags & 
float_muladd_negate_rounded_result) { + r = float16_chs(r); + } + return r; } float16 float16_muladd(float16 a, float16 b, float16 c, @@ -2254,13 +2259,18 @@ float32_muladd_scalbn(float32 a, float32 b, float32 c, int scale, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; + float32 r; float32_unpack_canonical(&pa, a, status); float32_unpack_canonical(&pb, b, status); float32_unpack_canonical(&pc, c, status); pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status); - return float32_round_pack_canonical(pr, status); + r = float32_round_pack_canonical(pr, status); + if (flags & float_muladd_negate_rounded_result) { + r = float32_chs(r); + } + return r; } float64 QEMU_SOFTFLOAT_ATTR @@ -2268,13 +2278,18 @@ float64_muladd_scalbn(float64 a, float64 b, float64 c, int scale, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; + float64 r; float64_unpack_canonical(&pa, a, status); float64_unpack_canonical(&pb, b, status); float64_unpack_canonical(&pc, c, status); pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status); - return float64_round_pack_canonical(pr, status); + r = float64_round_pack_canonical(pr, status); + if (flags & float_muladd_negate_rounded_result) { + r = float64_chs(r); + } + return r; } static bool force_soft_fma; @@ -2422,39 +2437,54 @@ float64 float64r32_muladd(float64 a, float64 b, float64 c, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; + float64 r; float64_unpack_canonical(&pa, a, status); float64_unpack_canonical(&pb, b, status); float64_unpack_canonical(&pc, c, status); pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status); - return float64r32_round_pack_canonical(pr, status); + r = float64r32_round_pack_canonical(pr, status); + if (flags & float_muladd_negate_rounded_result) { + r = float64_chs(r); + } + return r; } bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c, int flags, float_status *status) { FloatParts64 pa, pb, pc, *pr; + bfloat16 r; 
bfloat16_unpack_canonical(&pa, a, status); bfloat16_unpack_canonical(&pb, b, status); bfloat16_unpack_canonical(&pc, c, status); pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status); - return bfloat16_round_pack_canonical(pr, status); + r = bfloat16_round_pack_canonical(pr, status); + if (flags & float_muladd_negate_rounded_result) { + r = bfloat16_chs(r); + } + return r; } float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c, int flags, float_status *status) { FloatParts128 pa, pb, pc, *pr; + float128 r; float128_unpack_canonical(&pa, a, status); float128_unpack_canonical(&pb, b, status); float128_unpack_canonical(&pc, c, status); pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status); - return float128_round_pack_canonical(pr, status); + r = float128_round_pack_canonical(pr, status); + if (flags & float_muladd_negate_rounded_result) { + r = float128_chs(r); + } + return r; } /* diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h index c18ab2cb60..db7ea2c916 100644 --- a/include/fpu/softfloat.h +++ b/include/fpu/softfloat.h @@ -129,7 +129,8 @@ enum { float_muladd_negate_c = 1, float_muladd_negate_product = 2, float_muladd_negate_result = 4, - float_muladd_suppress_add_product_zero = 8, + float_muladd_negate_rounded_result = 8, + float_muladd_suppress_add_product_zero = 16, }; /*---------------------------------------------------------------------------- diff --git a/target/loongarch/tcg/insn_trans/trans_farith.c.inc b/target/loongarch/tcg/insn_trans/trans_farith.c.inc index f4a0dea727..68d149647e 100644 --- a/target/loongarch/tcg/insn_trans/trans_farith.c.inc +++ b/target/loongarch/tcg/insn_trans/trans_farith.c.inc @@ -199,9 +199,11 @@ TRANS(fmadd_s, FP_SP, gen_muladd, gen_helper_fmuladd_s, 0) TRANS(fmadd_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, 0) TRANS(fmsub_s, FP_SP, gen_muladd, gen_helper_fmuladd_s, float_muladd_negate_c) TRANS(fmsub_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, float_muladd_negate_c) -TRANS(fnmadd_s, 
FP_SP, gen_muladd, gen_helper_fmuladd_s, float_muladd_negate_result) -TRANS(fnmadd_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, float_muladd_negate_result) +TRANS(fnmadd_s, FP_SP, gen_muladd, gen_helper_fmuladd_s, + float_muladd_negate_rounded_result) +TRANS(fnmadd_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, + float_muladd_negate_rounded_result) TRANS(fnmsub_s, FP_SP, gen_muladd, gen_helper_fmuladd_s, - float_muladd_negate_c | float_muladd_negate_result) + float_muladd_negate_c | float_muladd_negate_rounded_result) TRANS(fnmsub_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, - float_muladd_negate_c | float_muladd_negate_result) + float_muladd_negate_c | float_muladd_negate_rounded_result) diff --git a/target/loongarch/tcg/vec_helper.c b/target/loongarch/tcg/vec_helper.c index 3faf52cbc4..d20f887afa 100644 --- a/target/loongarch/tcg/vec_helper.c +++ b/target/loongarch/tcg/vec_helper.c @@ -2458,12 +2458,12 @@ DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0) DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0) DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c) DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c) -DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result) -DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result) +DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_rounded_result) +DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_rounded_result) DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd, - float_muladd_negate_c | float_muladd_negate_result) + float_muladd_negate_c | float_muladd_negate_rounded_result) DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd, - float_muladd_negate_c | float_muladd_negate_result) + float_muladd_negate_c | float_muladd_negate_rounded_result) #define DO_2OP_F(NAME, BIT, E, FN) \ void HELPER(NAME)(void *vd, void *vj, \ diff --git a/tests/tcg/loongarch64/Makefile.target b/tests/tcg/loongarch64/Makefile.target index 00030a1026..e3554a500e 100644 --- 
a/tests/tcg/loongarch64/Makefile.target +++ b/tests/tcg/loongarch64/Makefile.target @@ -16,5 +16,7 @@ LOONGARCH64_TESTS += test_fclass LOONGARCH64_TESTS += test_fpcom LOONGARCH64_TESTS += test_pcadd LOONGARCH64_TESTS += test_fcsr +LOONGARCH64_TESTS += test_fnmsub +LOONGARCH64_TESTS += test_vfnmsub TESTS += $(LOONGARCH64_TESTS) diff --git a/tests/tcg/loongarch64/test_fnmsub.c b/tests/tcg/loongarch64/test_fnmsub.c new file mode 100644 index 0000000000..47fef92cb7 --- /dev/null +++ b/tests/tcg/loongarch64/test_fnmsub.c @@ -0,0 +1,25 @@ +#include <assert.h> +#include <stdint.h> +#include <fenv.h> + +int main() +{ + double x, y, z; + union { + uint64_t i; + double d; + } u; + + x = 0x1.0p256; + y = 0x1.0p256; + z = 0x1.0p-256; + + fesetround(FE_DOWNWARD); + asm("fnmsub.d %[x], %[x], %[y], %[z]\n\t" + :[x]"+f"(x) + :[y]"f"(y), [z]"f"(z)); + + u.d = x; + assert(u.i == 0xdfefffffffffffffUL); + return 0; +} diff --git a/tests/tcg/loongarch64/test_vfnmsub.c b/tests/tcg/loongarch64/test_vfnmsub.c new file mode 100644 index 0000000000..8c332674ae --- /dev/null +++ b/tests/tcg/loongarch64/test_vfnmsub.c @@ -0,0 +1,27 @@ +#include <assert.h> +#include <stdint.h> +#include <fenv.h> + +int main() +{ + uint64_t x, y, z; + + x = 0x4ff0000000000000UL; + y = 0x4ff0000000000000UL; + z = 0x2ff0000000000000UL; + + fesetround(FE_DOWNWARD); + asm("vreplgr2vr.d $vr0, %[x]\n\t" + "vreplgr2vr.d $vr1, %[y]\n\t" + "vreplgr2vr.d $vr2, %[z]\n\t" + "vfnmsub.d $vr0, $vr0, $vr1, $vr2\n\t" + "vpickve2gr.d %[x], $vr0, 0\n\t" + "vpickve2gr.d %[y], $vr0, 1\n\t" + :[x]"+&r"(x), [y]"+&r"(y) + :[z]"r"(z) + :"$f0", "$f1", "$f2"); + + assert(x == 0xdfefffffffffffffUL); + assert(y == 0xdfefffffffffffffUL); + return 0; +} -- 2.49.0