https://github.com/wenju-he updated https://github.com/llvm/llvm-project/pull/177131
>From c2e8802a874bdb368b349be588e18b410d0c1985 Mon Sep 17 00:00:00 2001 From: Wenju He <[email protected]> Date: Wed, 21 Jan 2026 07:41:41 +0100 Subject: [PATCH 1/3] [libclc] replace float remquo with amd ocml implementation The current implementation has two issues: * It unconditionally soft-flushes denormals. * It can't pass the OpenCL CTS test "test_bruteforce remquo" on Intel GPUs. This PR upstreams the remquo implementation from AMD's ocml (https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs/ocml/src/remainderF_base.h). It supports denormals and passes the OpenCL CTS test. Note that __oclc_finite_only_opt is set to false, as there is no dynamic dispatching for the generic implementation. The number of LLVM IR instructions in function _Z6remquoffPU3AS5i increased from 96 to 678. --- libclc/clc/lib/generic/math/clc_remquo.cl | 11 +- libclc/clc/lib/generic/math/clc_remquo.inc | 150 ++++++++++++--------- 2 files changed, 99 insertions(+), 62 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl index fd83ead06d89a..db7ab8ceaa073 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.cl +++ b/libclc/clc/lib/generic/math/clc_remquo.cl @@ -7,14 +7,23 @@ //===----------------------------------------------------------------------===// #include <clc/clc_convert.h> +#include <clc/float/definitions.h> #include <clc/integer/clc_clz.h> #include <clc/internal/clc.h> +#include <clc/math/clc_copysign.h> +#include <clc/math/clc_fabs.h> #include <clc/math/clc_floor.h> #include <clc/math/clc_fma.h> +#include <clc/math/clc_frexp.h> #include <clc/math/clc_ldexp.h> -#include <clc/math/clc_subnormal_config.h> +#include <clc/math/clc_nan.h> +#include <clc/math/clc_native_recip.h> +#include <clc/math/clc_rint.h> +#include <clc/math/clc_sincos_helpers.h> #include <clc/math/clc_trunc.h> #include <clc/math/math.h> +#include <clc/relational/clc_isfinite.h> +#include <clc/relational/clc_isnan.h> #include <clc/shared/clc_max.h> #define __CLC_ADDRESS_SPACE private diff
--git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc index 3a76ffed7f039..69c9a8731e907 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.inc +++ b/libclc/clc/lib/generic/math/clc_remquo.inc @@ -8,69 +8,97 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, __CLC_ADDRESS_SPACE int *quo) { - x = __clc_flush_denormal_if_not_supported(x); - y = __clc_flush_denormal_if_not_supported(y); - int ux = __clc_as_int(x); - int ax = ux & EXSIGNBIT_SP32; - float xa = __clc_as_float(ax); - int sx = ux ^ ax; - int ex = ax >> EXPSHIFTBITS_SP32; - - int uy = __clc_as_int(y); - int ay = uy & EXSIGNBIT_SP32; - float ya = __clc_as_float(ay); - int sy = uy ^ ay; - int ey = ay >> EXPSHIFTBITS_SP32; - - float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); - float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); - int c; - int k = ex - ey; - - uint q = 0; - - while (k > 0) { - c = xr >= yr; - q = (q << 1) | c; - xr -= c ? yr : 0.0f; - xr += xr; - --k; + const int bits = 12; + float ax = __clc_fabs(x); + float ay = __clc_fabs(y); + float ret; + int q7; + if (ax > ay) { + int ex, ey; + ex = ({ + int _exp; + __clc_frexp(ax, &_exp); + _exp; + }) - + 1; + ax = __clc_ldexp(({ + int _exp; + __clc_frexp(ax, &_exp); + }), + bits); + ey = ({ + int _exp; + __clc_frexp(ay, &_exp); + _exp; + }) - + 1; + ay = __clc_ldexp(({ + int _exp; + __clc_frexp(ay, &_exp); + }), + 1); + int nb = ex - ey; + float ayinv = __clc_native_recip(ay); + int qacc = 0; + while (nb > bits) { + float q = __clc_rint(ax * ayinv); + ax = __clc_fma(-q, ay, ax); + int clt = ax < 0.0f; + float axp = ax + ay; + ax = clt ? axp : ax; + + int iq = (int)q; + iq -= clt; + qacc = (qacc << bits) | iq; + + ax = __clc_ldexp(ax, bits); + nb -= bits; + } + ax = __clc_ldexp(ax, nb - bits + 1); + { + float q = __clc_rint(ax * ayinv); + ax = __clc_fma(-q, ay, ax); + int clt = ax < 0.0f; + float axp = ax + ay; + ax = clt ? 
axp : ax; + int iq = (int)q; + iq -= clt; + qacc = (qacc << (nb + 1)) | iq; + } + int aq = (2.0f * ax > ay) | ((qacc & 0x1) & (2.0f * ax == ay)); + ax = ax - (aq ? ay : 0.0f); + qacc += aq; + int qneg = (__clc_as_int(x) ^ __clc_as_int(y)) >> 31; + q7 = ((qacc & 0x7f) ^ qneg) - qneg; + ax = __clc_ldexp(ax, ey); + ret = + __clc_as_float((__clc_as_int(x) & (int)0x80000000) ^ __clc_as_int(ax)); + } else { + ret = x; + q7 = 0; + bool c = (ay<0x1.0p+127f & 2.0f * ax> ay) | (ax > 0.5f * ay); + + int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1); + float t = __clc_fma(y, -(float)qsgn, x); + ret = c ? t + : (__builtin_isfpclass(__builtin_canonicalizef(0x1p-149f), 0x0040) + ? __builtin_canonicalizef(x) + : x); + q7 = c ? qsgn : q7; + ret = ax == ay ? __clc_copysign(0.0f, x) : ret; + q7 = ax == ay ? qsgn : q7; + } + bool __oclc_finite_only_opt = false; + if (!__oclc_finite_only_opt) { + ret = y == 0.0f ? __clc_nan(0) : ret; + q7 = y == 0.0f ? 0 : q7; + bool c = !__clc_isnan(y) && __clc_isfinite(x); + ret = c ? ret : __clc_nan(0); + q7 = c ? q7 : 0; } - c = xr > yr; - q = (q << 1) | c; - xr -= c ? yr : 0.0f; - - int lt = ex < ey; - - q = lt ? 0 : q; - xr = lt ? xa : xr; - yr = lt ? ya : yr; - - c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1)); - xr -= c ? yr : 0.0f; - q += c; - - float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); - xr *= lt ? 1.0f : s; - - int qsgn = sx == sy ? 1 : -1; - int quot = (q & 0x7f) * qsgn; - - c = ax == ay; - quot = c ? qsgn : quot; - xr = c ? 0.0f : xr; - - xr = __clc_as_float(sx ^ __clc_as_int(xr)); - - c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | - ay == 0; - quot = c ? 0 : quot; - xr = c ? 
__clc_as_float(QNANBITPATT_SP32) : xr; - - *quo = quot; - - return xr; + *quo = q7; + return ret; } // remquo signature is special, we don't have macro for this >From 84e91fe2ea08ec5ac22ec7319831408cf36021ae Mon Sep 17 00:00:00 2001 From: Wenju He <[email protected]> Date: Wed, 21 Jan 2026 11:20:26 +0100 Subject: [PATCH 2/3] always canonicalize --- libclc/clc/lib/generic/math/clc_remquo.inc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc index 69c9a8731e907..7281aef29ef44 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.inc +++ b/libclc/clc/lib/generic/math/clc_remquo.inc @@ -80,10 +80,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1); float t = __clc_fma(y, -(float)qsgn, x); - ret = c ? t - : (__builtin_isfpclass(__builtin_canonicalizef(0x1p-149f), 0x0040) - ? __builtin_canonicalizef(x) - : x); + ret = c ? t : __builtin_elementwise_canonicalize(x); q7 = c ? qsgn : q7; ret = ax == ay ? __clc_copysign(0.0f, x) : ret; q7 = ax == ay ? 
qsgn : q7; >From c8fd881acee9e0a7c1b00c0f61f45c26df23488f Mon Sep 17 00:00:00 2001 From: Wenju He <[email protected]> Date: Wed, 21 Jan 2026 18:22:53 +0800 Subject: [PATCH 3/3] Update libclc/clc/lib/generic/math/clc_remquo.inc Co-authored-by: Copilot <[email protected]> --- libclc/clc/lib/generic/math/clc_remquo.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc index 7281aef29ef44..4babbc60727d2 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.inc +++ b/libclc/clc/lib/generic/math/clc_remquo.inc @@ -76,7 +76,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, } else { ret = x; q7 = 0; - bool c = (ay<0x1.0p+127f & 2.0f * ax> ay) | (ax > 0.5f * ay); + bool c = (ay < 0x1.0p+127f & 2.0f * ax > ay) | (ax > 0.5f * ay); int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1); float t = __clc_fma(y, -(float)qsgn, x); _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
