This new implementation of fixunstfdi and fixunstfti gives a 16X performance improvement. The design focuses on: - Making sure the end result is a pure leaf function that needs only builtins or inline functions. - Assuming POWER8 direct register transfer and accessing the IBM long double as an integer bit-field structure. - Understanding the quirks of IBM long double and decomposing the code into a set of optimized sub-cases. Tested on powerpc64le.
2018-10-20 Steven Munroe <munroes...@gmail.com> Rajalakshmi Srinivasaraghavan <r...@linux.vnet.ibm.com> * libgcc/config/rs6000/t-ppc64-fp (LIB2ADD): Add $(srcdir)/config/rs6000/fixunstfti.c. * libgcc/config/rs6000/ppc64-fp.c (__fixunstfdi): Remove definition. * libgcc/config/rs6000/fixunstfti.c: New file. * libgcc/config/rs6000/fixunstfdi.c: Likewise. * libgcc/config/rs6000/ibm-ldouble.h: Likewise. --- libgcc/config/rs6000/fixunstfdi.c | 124 ++++++++++++++++++++++++++++ libgcc/config/rs6000/fixunstfti.c | 125 +++++++++++++++++++++++++++++ libgcc/config/rs6000/ibm-ldouble.h | 121 ++++++++++++++++++++++++++++ libgcc/config/rs6000/ppc64-fp.c | 24 ------ libgcc/config/rs6000/t-ppc64-fp | 5 +- 5 files changed, 374 insertions(+), 25 deletions(-) create mode 100755 libgcc/config/rs6000/fixunstfdi.c create mode 100755 libgcc/config/rs6000/fixunstfti.c create mode 100755 libgcc/config/rs6000/ibm-ldouble.h diff --git a/libgcc/config/rs6000/fixunstfdi.c b/libgcc/config/rs6000/fixunstfdi.c new file mode 100755 index 00000000000..1b1a4f280bd --- /dev/null +++ b/libgcc/config/rs6000/fixunstfdi.c @@ -0,0 +1,124 @@ +/* Convert IBM long double to 64bit unsigned integer. + + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file into + combinations with other programs, and to distribute those + combinations without any restriction coming from the use of this + file. 
(The Lesser General Public License restrictions do apply in + other respects; for example, they cover modification of the file, + and distribution when not linked into a combine executable.) + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if defined(__powerpc64__) || defined (__64BIT__) || defined(__ppc64__) +#include <stdint.h> +#include "ibm-ldouble.h" + +typedef unsigned int UDItype __attribute__ ((mode (DI))); +typedef float TFtype __attribute__ ((mode (TF))); +extern UDItype __fixunstfdi (TFtype); + +#define TWO53 9007199254740992.0L +#define TWO64 18446744073709551616.0L + +UDItype +__fixunstfdi (TFtype a) +{ + unsigned long result; + unsigned long qi0, qi1; + union ibm_extended_long_double ld; + uint64_t l0, l1; + long exp0, exp1; + const uint64_t two52 = 0x10000000000000; + if (__builtin_unpack_longdouble (a, 0) < TWO53) + { + /* In this case the integer portion is completely contained + within the high double. So use the hardware convert to + integer doubleword, and then extend to int. */ + l1 = __builtin_unpack_longdouble (a, 0); + result = l1; + } + else + { + if (a < TWO64) + { + ld.ld = a; + l0 = two52 | ((uint64_t)ld.d[0].ieee.mantissa0 << 32) + | ld.d[0].ieee.mantissa1; + l1 = two52 | ((uint64_t)ld.d[1].ieee.mantissa0 << 32) + | ld.d[1].ieee.mantissa1; + exp0 = ld.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS; + exp1 = ld.d[1].ieee.exponent - IEEE754_DOUBLE_BIAS; + /* The high double shift is (non-negative) because in this + case we know the value it greater than 2^53 -1. 
*/ + qi0 = l0; + qi0 = qi0 << (exp0 - 52); + /* The low double is tricky because it could be + zero/denormal and have a large negative exponent. */ + if ( exp1 > -1022) + { + /* Need to right justify the integer portion of the + low double. This may be a left or right shift. */ + exp1 = exp1 - 52; + if (exp1 < 0) + { + /* Negative exponent, shift right to truncate. */ + l1 = l1 >> (-exp1); + /* If we shift all the significant bit away, but + the signs differ then the sign bit has + significance. */ + if (l1 == 0) + l1 = ld.d[1].ieee.negative; + qi1 = l1; + } + else + { + /* Non-negative exponent, shift left. */ + qi1 = l1; + qi1 = qi1 << (exp1); + } + /* In this case high dbl must be positive so only have + to check if low dbl is negative. If the signs + differ, then subtract the converted low from the + high as quadwords. Otherwise simply add the + converted quadwords. */ + if (ld.d[1].ieee.negative) + result = qi0 - qi1; + else + result = qi0 + qi1; + } + else + { + /* Here the low double is denormal or zero. So only + the converted high double is significant. */ + result = qi0; + } + } + else + { + /* Overflow case. Convert the high double then replicate + to high/low int. This will generate the overflow + value and sets CVI. */ + l0 = __builtin_unpack_longdouble (a, 0); + result = l0; + } + } + return (result); +} +#endif diff --git a/libgcc/config/rs6000/fixunstfti.c b/libgcc/config/rs6000/fixunstfti.c new file mode 100755 index 00000000000..68a8da9c91b --- /dev/null +++ b/libgcc/config/rs6000/fixunstfti.c @@ -0,0 +1,125 @@ +/* Convert IBM long double to 128bit unsigned integer. + + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file into + combinations with other programs, and to distribute those + combinations without any restriction coming from the use of this + file. (The Lesser General Public License restrictions do apply in + other respects; for example, they cover modification of the file, + and distribution when not linked into a combine executable.) + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if defined(__powerpc64__) || defined (__64BIT__) || defined(__ppc64__) +#include <stdint.h> +#include "ibm-ldouble.h" + +typedef unsigned int UTItype __attribute__ ((mode (TI))); +typedef float TFtype __attribute__ ((mode (TF))); +extern UTItype __fixunstfti (TFtype); + +#define TWO53 9007199254740992.0L +#define TWO128 340282366920938463463374607431768211456.0L + +UTItype +__fixunstfti (TFtype a) +{ + unsigned __int128 result; + unsigned __int128 qi0, qi1; + union ibm_extended_long_double ld; + uint64_t l0, l1; + long exp0, exp1; + const uint64_t two52 = 0x10000000000000; + if (__builtin_unpack_longdouble (a, 0) < TWO53) + { + /* In this case the integer portion is completely contained + within the high double. So use the hardware convert to + integer doubleword, and then extend to __int128. 
*/ + l1 = __builtin_unpack_longdouble (a, 0); + result = l1; + } + else + { + if (a < TWO128) + { + ld.ld = a; + l0 = two52 | ((uint64_t)ld.d[0].ieee.mantissa0 << 32) + | ld.d[0].ieee.mantissa1; + l1 = two52 | ((uint64_t)ld.d[1].ieee.mantissa0 << 32) + | ld.d[1].ieee.mantissa1; + exp0 = ld.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS; + exp1 = ld.d[1].ieee.exponent - IEEE754_DOUBLE_BIAS; + /* The high double shift is (non-negative) because in this + case we know the value it greater than 2^53 -1. */ + qi0 = l0; + qi0 = qi0 << (exp0 - 52); + /* The low double is tricky because it could be + zero/denormal and have a large negative exponent. */ + if ( exp1 > -1022) + { + /* Need to right justify the integer portion of the + low double. This may be a left or right shift. */ + exp1 = exp1 - 52; + if (exp1 < 0) + { + /* Negative exponent, shift right to truncate. */ + l1 = l1 >> (-exp1); + /* If we shift all the significant bit away, but + the signs differ then the sign bit has + significance. */ + if (l1 == 0) + l1 = ld.d[1].ieee.negative; + qi1 = l1; + } + else + { + /* Non-negative exponent, shift left. */ + qi1 = l1; + qi1 = qi1 << (exp1); + } + /* In this case high dbl must be positive so only have + to check if low dbl is negative. If the signs + differ, then subtract the converted low from the + high as quadwords. Otherwise simply add the + converted quadwords. */ + if (ld.d[1].ieee.negative) + result = qi0 - qi1; + else + result = qi0 + qi1; + } + else + { + /* Here the low double is denormal or zero. So only + the converted high double is significant. */ + result = qi0; + } + } + else + { + /* Overflow case. Convert the high double then replicate + to high/low __int128. This will generate the overflow + value and sets CVI. 
*/ + l0 = __builtin_unpack_longdouble (a, 0); + result = l0; + result = (result << 64) + l0; + } + } + return (result); +} +#endif diff --git a/libgcc/config/rs6000/ibm-ldouble.h b/libgcc/config/rs6000/ibm-ldouble.h new file mode 100755 index 00000000000..2fb89cd252a --- /dev/null +++ b/libgcc/config/rs6000/ibm-ldouble.h @@ -0,0 +1,121 @@ +/* Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <endian.h> +union ieee754_float + { + float f; + + /* This is the IEEE 754 single-precision format. */ + struct + { +#if __BYTE_ORDER == __BIG_ENDIAN + unsigned int negative:1; + unsigned int exponent:8; + unsigned int mantissa:23; +#endif /* Big endian. */ +#if __BYTE_ORDER == __LITTLE_ENDIAN + unsigned int mantissa:23; + unsigned int exponent:8; + unsigned int negative:1; +#endif /* Little endian. */ + } ieee; + + /* This format makes it easier to see if a NaN is a signalling NaN. */ + struct + { +#if __BYTE_ORDER == __BIG_ENDIAN + unsigned int negative:1; + unsigned int exponent:8; + unsigned int quiet_nan:1; + unsigned int mantissa:22; +#endif /* Big endian. */ +#if __BYTE_ORDER == __LITTLE_ENDIAN + unsigned int mantissa:22; + unsigned int quiet_nan:1; + unsigned int exponent:8; + unsigned int negative:1; +#endif /* Little endian. 
*/ + } ieee_nan; + }; + +#define IEEE754_FLOAT_BIAS 0x7f /* Added to exponent. */ + + +union ieee754_double + { + double d; + + /* This is the IEEE 754 double-precision format. */ + struct + { +#if __BYTE_ORDER == __BIG_ENDIAN + unsigned int negative:1; + unsigned int exponent:11; + /* Together these comprise the mantissa. */ + unsigned int mantissa0:20; + unsigned int mantissa1:32; +#endif /* Big endian. */ +#if __BYTE_ORDER == __LITTLE_ENDIAN + /* Together these comprise the mantissa. */ + unsigned int mantissa1:32; + unsigned int mantissa0:20; + unsigned int exponent:11; + unsigned int negative:1; +#endif /* Little endian. */ + } ieee; + + /* This format makes it easier to see if a NaN is a signalling NaN. */ + struct + { +#if __BYTE_ORDER == __BIG_ENDIAN + unsigned int negative:1; + unsigned int exponent:11; + unsigned int quiet_nan:1; + /* Together these comprise the mantissa. */ + unsigned int mantissa0:19; + unsigned int mantissa1:32; +#else + /* Together these comprise the mantissa. */ + unsigned int mantissa1:32; + unsigned int mantissa0:19; + unsigned int quiet_nan:1; + unsigned int exponent:11; + unsigned int negative:1; +#endif + } ieee_nan; + }; + +#define IEEE754_DOUBLE_BIAS 0x3ff /* Added to exponent. */ + + +/* IBM extended format for long double. + + Each long double is made up of two IEEE doubles. The value of the + long double is the sum of the values of the two parts. The most + significant part is required to be the value of the long double + rounded to the nearest double, as specified by IEEE. For Inf + values, the least significant part is required to be one of +0.0 or + -0.0. No other requirements are made; so, for example, 1.0 may be + represented as (1.0, +0.0) or (1.0, -0.0), and the low part of a + NaN is don't-care. 
*/ + +union ibm_extended_long_double + { + long double ld; + union ieee754_double d[2]; + }; diff --git a/libgcc/config/rs6000/ppc64-fp.c b/libgcc/config/rs6000/ppc64-fp.c index faffc82eeda..97921632dcb 100644 --- a/libgcc/config/rs6000/ppc64-fp.c +++ b/libgcc/config/rs6000/ppc64-fp.c @@ -183,30 +183,6 @@ __floatundisf (UDItype u) return (SFtype) f; } -DItype -__fixunstfdi (TFtype a) -{ - if (a < 0) - return 0; - - /* Compute high word of result, as a flonum. */ - const TFtype b = (a / (((UDItype) 1) << (sizeof (SItype) * 8))); - /* Convert that to fixed (but not to DItype!), - and shift it into the high word. */ - UDItype v = (USItype) b; - v <<= (sizeof (SItype) * 8); - /* Remove high part from the TFtype, leaving the low part as flonum. */ - a -= (TFtype) v; - /* Convert that to fixed (but not to DItype!) and add it in. - Sometimes A comes out negative. This is significant, since - A has more bits than a long int does. */ - if (a < 0) - v -= (USItype) (-a); - else - v += (USItype) a; - return v; -} - /* This version is needed to prevent recursion; fixunsdfdi in libgcc calls fixdfdi, which in turn calls calls fixunsdfdi. */ diff --git a/libgcc/config/rs6000/t-ppc64-fp b/libgcc/config/rs6000/t-ppc64-fp index 26d1730bcdb..37b75a931ff 100644 --- a/libgcc/config/rs6000/t-ppc64-fp +++ b/libgcc/config/rs6000/t-ppc64-fp @@ -1,2 +1,5 @@ # Can be used unconditionally, wrapped in __powerpc64__ || __64BIT__ __ppc64__. -LIB2ADD += $(srcdir)/config/rs6000/ppc64-fp.c +LIB2FUNCS_EXCLUDE = _fixunstfdi +LIB2ADD += $(srcdir)/config/rs6000/ppc64-fp.c \ + $(srcdir)/config/rs6000/fixunstfti.c \ + $(srcdir)/config/rs6000/fixunstfdi.c -- 2.18.0