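The compiler cannot chain more than two additions together into a single carry-propagating sequence. On x86_64, use inline assembly (add/adc and sub/sbb) for the 3- and 4-word additions and subtractions, and for the 256-bit negation; other hosts keep the existing C implementations.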
Signed-off-by: Richard Henderson <[email protected]> --- include/fpu/softfloat-macros.h | 18 ++++++++++++++++-- fpu/softfloat.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h index 95d88d05b8..99fa124e56 100644 --- a/include/fpu/softfloat-macros.h +++ b/include/fpu/softfloat-macros.h @@ -436,6 +436,13 @@ static inline void uint64_t *z2Ptr ) { +#ifdef __x86_64__ + asm("add %5, %2\n\t" + "adc %4, %1\n\t" + "adc %3, %0" + : "=&r"(*z0Ptr), "=&r"(*z1Ptr), "=&r"(*z2Ptr) + : "rm"(b0), "rm"(b1), "rm"(b2), "0"(a0), "1"(a1), "2"(a2)); +#else uint64_t z0, z1, z2; int8_t carry0, carry1; @@ -450,7 +457,7 @@ static inline void *z2Ptr = z2; *z1Ptr = z1; *z0Ptr = z0; - +#endif } /*---------------------------------------------------------------------------- @@ -494,6 +501,13 @@ static inline void uint64_t *z2Ptr ) { +#ifdef __x86_64__ + asm("sub %5, %2\n\t" + "sbb %4, %1\n\t" + "sbb %3, %0" + : "=&r"(*z0Ptr), "=&r"(*z1Ptr), "=&r"(*z2Ptr) + : "rm"(b0), "rm"(b1), "rm"(b2), "0"(a0), "1"(a1), "2"(a2)); +#else uint64_t z0, z1, z2; int8_t borrow0, borrow1; @@ -508,7 +522,7 @@ static inline void *z2Ptr = z2; *z1Ptr = z1; *z0Ptr = z0; - +#endif } /*---------------------------------------------------------------------------- diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 49de31fec2..54d0b210ac 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -7340,6 +7340,15 @@ static inline void shift256RightJamming(UInt256 *p, unsigned count) /* R = A - B */ static void sub256(UInt256 *r, UInt256 *a, UInt256 *b) { +#if defined(__x86_64__) + asm("sub %7, %3\n\t" + "sbb %6, %2\n\t" + "sbb %5, %1\n\t" + "sbb %4, %0" + : "=&r"(r->w[0]), "=&r"(r->w[1]), "=&r"(r->w[2]), "=&r"(r->w[3]) + : "rme"(b->w[0]), "rme"(b->w[1]), "rme"(b->w[2]), "rme"(b->w[3]), + "0"(a->w[0]), "1"(a->w[1]), "2"(a->w[2]), "3"(a->w[3])); +#else bool borrow = false; for (int i = 3; i >= 0; --i) { @@ -7355,11 +7364,21 
@@ static void sub256(UInt256 *r, UInt256 *a, UInt256 *b) } r->w[i] = rt; } +#endif } /* A = -A */ static void neg256(UInt256 *a) { +#if defined(__x86_64__) + asm("negq %3\n\t" + "sbb %6, %2\n\t" + "sbb %5, %1\n\t" + "sbb %4, %0" + : "=&r"(a->w[0]), "=&r"(a->w[1]), "=&r"(a->w[2]), "+rm"(a->w[3]) + : "rme"(a->w[0]), "rme"(a->w[1]), "rme"(a->w[2]), + "0"(0), "1"(0), "2"(0)); +#else /* * Recall that -X - 1 = ~X, and that since this is negation, * once we find a non-zero number, all subsequent words will @@ -7388,11 +7407,20 @@ static void neg256(UInt256 *a) a->w[1] = ~a->w[1]; not0: a->w[0] = ~a->w[0]; +#endif } /* A += B */ static void add256(UInt256 *a, UInt256 *b) { +#if defined(__x86_64__) + asm("add %7, %3\n\t" + "adc %6, %2\n\t" + "adc %5, %1\n\t" + "adc %4, %0" + : "+r"(a->w[0]), "+r"(a->w[1]), "+r"(a->w[2]), "+r"(a->w[3]) + : "rme"(b->w[0]), "rme"(b->w[1]), "rme"(b->w[2]), "rme"(b->w[3])); +#else bool carry = false; for (int i = 3; i >= 0; --i) { @@ -7407,6 +7435,7 @@ static void add256(UInt256 *a, UInt256 *b) } a->w[i] = at; } +#endif } float128 float128_muladd(float128 a_f, float128 b_f, float128 c_f, -- 2.25.1
