This patch fixes incorrect results for the `[xv]fnm{add,sub}.{s,d}`
instructions under the round-toward-zero, round-toward-positive, and
round-toward-negative rounding modes.

According to the LoongArch ISA specification, the result of an
instruction like `FNMSUB.D` is computed as:

  FR[fd] = -FP64_fusedMultiplyAdd(FR[fj], FR[fk], -FR[fa])

Here, `FP64_fusedMultiplyAdd()` performs a fused multiply-add operation
compliant with IEEE 754-2008. The negation is applied to the fully
rounded result of the fused operation, not to any intermediate value.
This behavior is specific to LoongArch and differs from other
architectures, which is why the existing `float_muladd_negate_result`
flag does not model it correctly.

To address this, I introduce a new flag, `float_muladd_negate_rounded_result`,
which applies the negation only after the result has been rounded.
This ensures that sign-dependent rounding decisions are made on the
un-negated value, matching the ISA behavior described above.
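
For reference, a caller would request the new behavior roughly like
this (a sketch against the softfloat API as modified here):

  float_status st = { };

  set_float_rounding_mode(float_round_down, &st);
  /* FNMSUB.D: negate_c computes a*b - c; negate_rounded_result
   * negates only after the round-and-pack step. */
  float64 r = float64_muladd(a, b, c,
                             float_muladd_negate_c |
                             float_muladd_negate_rounded_result, &st);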

Reported-by: mengqinggang <mengqingg...@loongson.cn>
Signed-off-by: WANG Rui <wang...@loongson.cn>
---
v1 -> v2:
- Introduce `float_muladd_negate_rounded_result`
---
 fpu/softfloat.c                               | 42 ++++++++++++++++---
 include/fpu/softfloat.h                       |  3 +-
 .../tcg/insn_trans/trans_farith.c.inc         | 10 +++--
 target/loongarch/tcg/vec_helper.c             |  8 ++--
 tests/tcg/loongarch64/Makefile.target         |  2 +
 tests/tcg/loongarch64/test_fnmsub.c           | 25 +++++++++++
 tests/tcg/loongarch64/test_vfnmsub.c          | 27 ++++++++++++
 7 files changed, 102 insertions(+), 15 deletions(-)
 create mode 100644 tests/tcg/loongarch64/test_fnmsub.c
 create mode 100644 tests/tcg/loongarch64/test_vfnmsub.c

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 34c962d6bd..2691e89a03 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -2234,13 +2234,18 @@ float16_muladd_scalbn(float16 a, float16 b, float16 c,
                       int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
+    float16 r;
 
     float16_unpack_canonical(&pa, a, status);
     float16_unpack_canonical(&pb, b, status);
     float16_unpack_canonical(&pc, c, status);
     pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
-    return float16_round_pack_canonical(pr, status);
+    r = float16_round_pack_canonical(pr, status);
+    if (flags & float_muladd_negate_rounded_result) {
+        r = float16_chs(r);
+    }
+    return r;
 }
 
 float16 float16_muladd(float16 a, float16 b, float16 c,
@@ -2254,13 +2259,18 @@ float32_muladd_scalbn(float32 a, float32 b, float32 c,
                       int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
+    float32 r;
 
     float32_unpack_canonical(&pa, a, status);
     float32_unpack_canonical(&pb, b, status);
     float32_unpack_canonical(&pc, c, status);
     pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
-    return float32_round_pack_canonical(pr, status);
+    r = float32_round_pack_canonical(pr, status);
+    if (flags & float_muladd_negate_rounded_result) {
+        r = float32_chs(r);
+    }
+    return r;
 }
 
 float64 QEMU_SOFTFLOAT_ATTR
@@ -2268,13 +2278,18 @@ float64_muladd_scalbn(float64 a, float64 b, float64 c,
                       int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
+    float64 r;
 
     float64_unpack_canonical(&pa, a, status);
     float64_unpack_canonical(&pb, b, status);
     float64_unpack_canonical(&pc, c, status);
     pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
-    return float64_round_pack_canonical(pr, status);
+    r = float64_round_pack_canonical(pr, status);
+    if (flags & float_muladd_negate_rounded_result) {
+        r = float64_chs(r);
+    }
+    return r;
 }
 
 static bool force_soft_fma;
@@ -2422,39 +2437,54 @@ float64 float64r32_muladd(float64 a, float64 b, float64 c,
                           int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
+    float64 r;
 
     float64_unpack_canonical(&pa, a, status);
     float64_unpack_canonical(&pb, b, status);
     float64_unpack_canonical(&pc, c, status);
     pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
-    return float64r32_round_pack_canonical(pr, status);
+    r = float64r32_round_pack_canonical(pr, status);
+    if (flags & float_muladd_negate_rounded_result) {
+        r = float64_chs(r);
+    }
+    return r;
 }
 
 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                       int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
+    bfloat16 r;
 
     bfloat16_unpack_canonical(&pa, a, status);
     bfloat16_unpack_canonical(&pb, b, status);
     bfloat16_unpack_canonical(&pc, c, status);
     pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
-    return bfloat16_round_pack_canonical(pr, status);
+    r = bfloat16_round_pack_canonical(pr, status);
+    if (flags & float_muladd_negate_rounded_result) {
+        r = bfloat16_chs(r);
+    }
+    return r;
 }
 
 float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                       int flags, float_status *status)
 {
     FloatParts128 pa, pb, pc, *pr;
+    float128 r;
 
     float128_unpack_canonical(&pa, a, status);
     float128_unpack_canonical(&pb, b, status);
     float128_unpack_canonical(&pc, c, status);
     pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
-    return float128_round_pack_canonical(pr, status);
+    r = float128_round_pack_canonical(pr, status);
+    if (flags & float_muladd_negate_rounded_result) {
+        r = float128_chs(r);
+    }
+    return r;
 }
 
 /*
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index c18ab2cb60..db7ea2c916 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -129,7 +129,8 @@ enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
     float_muladd_negate_result = 4,
-    float_muladd_suppress_add_product_zero = 8,
+    float_muladd_negate_rounded_result = 8,
+    float_muladd_suppress_add_product_zero = 16,
 };
 
 /*----------------------------------------------------------------------------
diff --git a/target/loongarch/tcg/insn_trans/trans_farith.c.inc b/target/loongarch/tcg/insn_trans/trans_farith.c.inc
index f4a0dea727..68d149647e 100644
--- a/target/loongarch/tcg/insn_trans/trans_farith.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_farith.c.inc
@@ -199,9 +199,11 @@ TRANS(fmadd_s, FP_SP, gen_muladd, gen_helper_fmuladd_s, 0)
 TRANS(fmadd_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, 0)
 TRANS(fmsub_s, FP_SP, gen_muladd, gen_helper_fmuladd_s, float_muladd_negate_c)
 TRANS(fmsub_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, float_muladd_negate_c)
-TRANS(fnmadd_s, FP_SP, gen_muladd, gen_helper_fmuladd_s, float_muladd_negate_result)
-TRANS(fnmadd_d, FP_DP, gen_muladd, gen_helper_fmuladd_d, float_muladd_negate_result)
+TRANS(fnmadd_s, FP_SP, gen_muladd, gen_helper_fmuladd_s,
+      float_muladd_negate_rounded_result)
+TRANS(fnmadd_d, FP_DP, gen_muladd, gen_helper_fmuladd_d,
+      float_muladd_negate_rounded_result)
+TRANS(fnmadd_s, FP_SP, gen_muladd, gen_helper_fmuladd_s,
+      float_muladd_negate_rounded_result)
+TRANS(fnmadd_d, FP_DP, gen_muladd, gen_helper_fmuladd_d,
+      float_muladd_negate_rounded_result)
 TRANS(fnmsub_s, FP_SP, gen_muladd, gen_helper_fmuladd_s,
-      float_muladd_negate_c | float_muladd_negate_result)
+      float_muladd_negate_c | float_muladd_negate_rounded_result)
 TRANS(fnmsub_d, FP_DP, gen_muladd, gen_helper_fmuladd_d,
-      float_muladd_negate_c | float_muladd_negate_result)
+      float_muladd_negate_c | float_muladd_negate_rounded_result)
diff --git a/target/loongarch/tcg/vec_helper.c b/target/loongarch/tcg/vec_helper.c
index 3faf52cbc4..d20f887afa 100644
--- a/target/loongarch/tcg/vec_helper.c
+++ b/target/loongarch/tcg/vec_helper.c
@@ -2458,12 +2458,12 @@ DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0)
 DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0)
 DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c)
 DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c)
-DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result)
-DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result)
+DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_rounded_result)
+DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_rounded_result)
 DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd,
-         float_muladd_negate_c | float_muladd_negate_result)
+         float_muladd_negate_c | float_muladd_negate_rounded_result)
 DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd,
-         float_muladd_negate_c | float_muladd_negate_result)
+         float_muladd_negate_c | float_muladd_negate_rounded_result)
 
 #define DO_2OP_F(NAME, BIT, E, FN)                       \
 void HELPER(NAME)(void *vd, void *vj,                    \
diff --git a/tests/tcg/loongarch64/Makefile.target b/tests/tcg/loongarch64/Makefile.target
index 00030a1026..e3554a500e 100644
--- a/tests/tcg/loongarch64/Makefile.target
+++ b/tests/tcg/loongarch64/Makefile.target
@@ -16,5 +16,7 @@ LOONGARCH64_TESTS  += test_fclass
 LOONGARCH64_TESTS  += test_fpcom
 LOONGARCH64_TESTS  += test_pcadd
 LOONGARCH64_TESTS  += test_fcsr
+LOONGARCH64_TESTS  += test_fnmsub
+LOONGARCH64_TESTS  += test_vfnmsub
 
 TESTS += $(LOONGARCH64_TESTS)
diff --git a/tests/tcg/loongarch64/test_fnmsub.c b/tests/tcg/loongarch64/test_fnmsub.c
new file mode 100644
index 0000000000..47fef92cb7
--- /dev/null
+++ b/tests/tcg/loongarch64/test_fnmsub.c
@@ -0,0 +1,25 @@
+#include <assert.h>
+#include <stdint.h>
+#include <fenv.h>
+
+int main()
+{
+    double x, y, z;
+    union {
+        uint64_t i;
+        double d;
+    } u;
+
+    x = 0x1.0p256;
+    y = 0x1.0p256;
+    z = 0x1.0p-256;
+
+    fesetround(FE_DOWNWARD);
+    asm("fnmsub.d %[x], %[x], %[y], %[z]\n\t"
+        :[x]"+f"(x)
+        :[y]"f"(y), [z]"f"(z));
+
+    u.d = x;
+    assert(u.i == 0xdfefffffffffffffUL); /* -0x1.fffffffffffffp+511 */
+    return 0;
+}
diff --git a/tests/tcg/loongarch64/test_vfnmsub.c b/tests/tcg/loongarch64/test_vfnmsub.c
new file mode 100644
index 0000000000..8c332674ae
--- /dev/null
+++ b/tests/tcg/loongarch64/test_vfnmsub.c
@@ -0,0 +1,27 @@
+#include <assert.h>
+#include <stdint.h>
+#include <fenv.h>
+
+int main()
+{
+    uint64_t x, y, z;
+
+    x = 0x4ff0000000000000UL;
+    y = 0x4ff0000000000000UL;
+    z = 0x2ff0000000000000UL;
+
+    fesetround(FE_DOWNWARD);
+    asm("vreplgr2vr.d $vr0, %[x]\n\t"
+        "vreplgr2vr.d $vr1, %[y]\n\t"
+        "vreplgr2vr.d $vr2, %[z]\n\t"
+        "vfnmsub.d $vr0, $vr0, $vr1, $vr2\n\t"
+        "vpickve2gr.d %[x], $vr0, 0\n\t"
+        "vpickve2gr.d %[y], $vr0, 1\n\t"
+        :[x]"+&r"(x), [y]"+&r"(y)
+        :[z]"r"(z)
+        :"$f0", "$f1", "$f2");
+
+    assert(x == 0xdfefffffffffffffUL); /* -0x1.fffffffffffffp+511 */
+    assert(y == 0xdfefffffffffffffUL); /* both lanes negated after rounding */
+    return 0;
+}
-- 
2.49.0

