Author: Wenju He Date: 2026-01-26T08:11:14+08:00 New Revision: 20c15c7afe97a546d339a5500d3fa273412e2fda
URL: https://github.com/llvm/llvm-project/commit/20c15c7afe97a546d339a5500d3fa273412e2fda DIFF: https://github.com/llvm/llvm-project/commit/20c15c7afe97a546d339a5500d3fa273412e2fda.diff LOG: [libclc] replace float remquo with amd ocml implementation (#177131) The current implementation has two issues: * It unconditionally soft-flushes denormals. * It can't pass the OpenCL CTS test "test_bruteforce remquo" on Intel GPUs. This PR upstreams the remquo implementation from https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs/ocml/src/remainderF_base.h It supports denormals and passes the OpenCL CTS test. The number of LLVM IR instructions in function _Z6remquoffPU3AS5i increased from 96 to 680. --------- Co-authored-by: Copilot <[email protected]> Added: Modified: libclc/clc/lib/generic/math/clc_remquo.cl libclc/clc/lib/generic/math/clc_remquo.inc Removed: ################################################################################ diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl index fd83ead06d89a..db7ab8ceaa073 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.cl +++ b/libclc/clc/lib/generic/math/clc_remquo.cl @@ -7,14 +7,23 @@ //===----------------------------------------------------------------------===// #include <clc/clc_convert.h> +#include <clc/float/definitions.h> #include <clc/integer/clc_clz.h> #include <clc/internal/clc.h> +#include <clc/math/clc_copysign.h> +#include <clc/math/clc_fabs.h> #include <clc/math/clc_floor.h> #include <clc/math/clc_fma.h> +#include <clc/math/clc_frexp.h> #include <clc/math/clc_ldexp.h> -#include <clc/math/clc_subnormal_config.h> +#include <clc/math/clc_nan.h> +#include <clc/math/clc_native_recip.h> +#include <clc/math/clc_rint.h> +#include <clc/math/clc_sincos_helpers.h> #include <clc/math/clc_trunc.h> #include <clc/math/math.h> +#include <clc/relational/clc_isfinite.h> +#include <clc/relational/clc_isnan.h> #include <clc/shared/clc_max.h> #define __CLC_ADDRESS_SPACE private diff 
--git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc index 3a76ffed7f039..79eef077bf691 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.inc +++ b/libclc/clc/lib/generic/math/clc_remquo.inc @@ -8,69 +8,82 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, __CLC_ADDRESS_SPACE int *quo) { - x = __clc_flush_denormal_if_not_supported(x); - y = __clc_flush_denormal_if_not_supported(y); - int ux = __clc_as_int(x); - int ax = ux & EXSIGNBIT_SP32; - float xa = __clc_as_float(ax); - int sx = ux ^ ax; - int ex = ax >> EXPSHIFTBITS_SP32; - - int uy = __clc_as_int(y); - int ay = uy & EXSIGNBIT_SP32; - float ya = __clc_as_float(ay); - int sy = uy ^ ay; - int ey = ay >> EXPSHIFTBITS_SP32; - - float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); - float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); - int c; - int k = ex - ey; - - uint q = 0; - - while (k > 0) { - c = xr >= yr; - q = (q << 1) | c; - xr -= c ? yr : 0.0f; - xr += xr; - --k; + const int bits = 12; + float ax = __clc_fabs(x); + float ay = __clc_fabs(y); + float ret; + int q7; + if (ax > ay) { + int ex, ey; + { + int _exp; + float _mant = __clc_frexp(ax, &_exp); + ex = _exp - 1; + ax = __clc_ldexp(_mant, bits); + } + { + int _exp; + float _mant = __clc_frexp(ay, &_exp); + ey = _exp - 1; + ay = __clc_ldexp(_mant, 1); + } + int nb = ex - ey; + float ayinv = __clc_native_recip(ay); + int qacc = 0; + while (nb > bits) { + float q = __clc_rint(ax * ayinv); + ax = __clc_fma(-q, ay, ax); + int clt = ax < 0.0f; + float axp = ax + ay; + ax = clt ? axp : ax; + + int iq = (int)q; + iq -= clt; + qacc = (qacc << bits) | iq; + + ax = __clc_ldexp(ax, bits); + nb -= bits; + } + ax = __clc_ldexp(ax, nb - bits + 1); + { + float q = __clc_rint(ax * ayinv); + ax = __clc_fma(-q, ay, ax); + int clt = ax < 0.0f; + float axp = ax + ay; + ax = clt ? 
axp : ax; + int iq = (int)q; + iq -= clt; + qacc = (qacc << (nb + 1)) | iq; + } + int aq = (2.0f * ax > ay) | ((qacc & 0x1) & (2.0f * ax == ay)); + ax = ax - (aq ? ay : 0.0f); + qacc += aq; + int qneg = (__clc_as_int(x) ^ __clc_as_int(y)) >> 31; + q7 = ((qacc & 0x7f) ^ qneg) - qneg; + ax = __clc_ldexp(ax, ey); + ret = + __clc_as_float((__clc_as_int(x) & (int)0x80000000) ^ __clc_as_int(ax)); + } else { + ret = x; + q7 = 0; + bool c = (ay < 0x1.0p+127f & 2.0f * ax > ay) | (ax > 0.5f * ay); + + int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1); + float t = __clc_fma(y, -(float)qsgn, x); + ret = c ? t : __builtin_elementwise_canonicalize(x); + q7 = c ? qsgn : q7; + ret = ax == ay ? __clc_copysign(0.0f, x) : ret; + q7 = ax == ay ? qsgn : q7; } - c = xr > yr; - q = (q << 1) | c; - xr -= c ? yr : 0.0f; + ret = y == 0.0f ? __clc_nan(0) : ret; + q7 = y == 0.0f ? 0 : q7; + bool c = !__clc_isnan(y) && __clc_isfinite(x); + ret = c ? ret : __clc_nan(0); + q7 = c ? q7 : 0; - int lt = ex < ey; - - q = lt ? 0 : q; - xr = lt ? xa : xr; - yr = lt ? ya : yr; - - c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1)); - xr -= c ? yr : 0.0f; - q += c; - - float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); - xr *= lt ? 1.0f : s; - - int qsgn = sx == sy ? 1 : -1; - int quot = (q & 0x7f) * qsgn; - - c = ax == ay; - quot = c ? qsgn : quot; - xr = c ? 0.0f : xr; - - xr = __clc_as_float(sx ^ __clc_as_int(xr)); - - c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | - ay == 0; - quot = c ? 0 : quot; - xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; - - *quo = quot; - - return xr; + *quo = q7; + return ret; } // remquo signature is special, we don't have macro for this _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
