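Mark the int64_t and uint64_t *_scalbn integer-to-float conversion functions with QEMU_FLATTEN. This merges the int_to_float (or uint_to_float) routine and the round_pack_canonical routine into the same function, allowing the FloatParts structure to be decomposed by the compiler.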
This results in a 60-75% speedup of the flattened function. Leave the narrower integer inputs to tail-call the int64_t version. Buglink: https://bugs.launchpad.net/qemu/+bug/1892081 Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- fpu/softfloat.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 7b6aee9323..2cbcf5bf10 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2794,7 +2794,8 @@ static FloatParts int_to_float(int64_t a, int scale, float_status *status) return r; } -float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) +float16 QEMU_FLATTEN +int64_to_float16_scalbn(int64_t a, int scale, float_status *status) { FloatParts pa = int_to_float(a, scale, status); return float16_round_pack_canonical(pa, status); @@ -2830,7 +2831,8 @@ float16 int8_to_float16(int8_t a, float_status *status) return int64_to_float16_scalbn(a, 0, status); } -float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) +float32 QEMU_FLATTEN +int64_to_float32_scalbn(int64_t a, int scale, float_status *status) { FloatParts pa = int_to_float(a, scale, status); return float32_round_pack_canonical(pa, status); @@ -2861,7 +2863,8 @@ float32 int16_to_float32(int16_t a, float_status *status) return int64_to_float32_scalbn(a, 0, status); } -float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) +float64 QEMU_FLATTEN +int64_to_float64_scalbn(int64_t a, int scale, float_status *status) { FloatParts pa = int_to_float(a, scale, status); return float64_round_pack_canonical(pa, status); @@ -2897,7 +2900,8 @@ float64 int16_to_float64(int16_t a, float_status *status) * to the bfloat16 format. 
*/ -bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status) +bfloat16 QEMU_FLATTEN +int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status) { FloatParts pa = int_to_float(a, scale, status); return bfloat16_round_pack_canonical(pa, status); @@ -2959,7 +2963,8 @@ static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) return r; } -float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) +float16 QEMU_FLATTEN +uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) { FloatParts pa = uint_to_float(a, scale, status); return float16_round_pack_canonical(pa, status); @@ -2995,7 +3000,8 @@ float16 uint8_to_float16(uint8_t a, float_status *status) return uint64_to_float16_scalbn(a, 0, status); } -float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) +float32 QEMU_FLATTEN +uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) { FloatParts pa = uint_to_float(a, scale, status); return float32_round_pack_canonical(pa, status); @@ -3026,7 +3032,8 @@ float32 uint16_to_float32(uint16_t a, float_status *status) return uint64_to_float32_scalbn(a, 0, status); } -float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) +float64 QEMU_FLATTEN +uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) { FloatParts pa = uint_to_float(a, scale, status); return float64_round_pack_canonical(pa, status); @@ -3062,7 +3069,8 @@ float64 uint16_to_float64(uint16_t a, float_status *status) * bfloat16 format. */ -bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) +bfloat16 QEMU_FLATTEN +uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) { FloatParts pa = uint_to_float(a, scale, status); return bfloat16_round_pack_canonical(pa, status); -- 2.25.1