https://github.com/karolherbst created https://github.com/llvm/llvm-project/pull/189781
I'm seeing regressions among `atan2`, `atan2pi`, remainder`, `remquo`, `sinpi` and other functions. I'm not in the mood to figure out why those commit broke it, hence I'm simply sending a bunch of reverts to let me pass the `math_brute_force` OpenCL CTS tests. >From b9be58300b1c4382eea84d64773d5f7bf0f6885c Mon Sep 17 00:00:00 2001 From: Karol Herbst <[email protected]> Date: Wed, 1 Apr 2026 02:41:02 +0200 Subject: [PATCH 1/5] Revert "libclc: Update atan2 (#188706)" This reverts commit 158282b17652acd3a5ab282dbcc54967fd9837d7. --- libclc/clc/lib/generic/math/clc_atan2.cl | 11 +- libclc/clc/lib/generic/math/clc_atan2.inc | 305 ++++++++++++++++------ 2 files changed, 224 insertions(+), 92 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_atan2.cl b/libclc/clc/lib/generic/math/clc_atan2.cl index 04bb53e357260..1ba6751f5a0f1 100644 --- a/libclc/clc/lib/generic/math/clc_atan2.cl +++ b/libclc/clc/lib/generic/math/clc_atan2.cl @@ -6,20 +6,21 @@ // //===----------------------------------------------------------------------===// +#include "clc/clc_convert.h" #include "clc/float/definitions.h" #include "clc/internal/clc.h" -#include "clc/math/clc_atan_helpers.h" #include "clc/math/clc_copysign.h" #include "clc/math/clc_fabs.h" #include "clc/math/clc_fma.h" -#include "clc/math/clc_fmax.h" -#include "clc/math/clc_fmin.h" #include "clc/math/clc_ldexp.h" #include "clc/math/clc_mad.h" +#include "clc/math/math.h" +#include "clc/math/tables.h" #include "clc/relational/clc_isinf.h" -#include "clc/relational/clc_isunordered.h" +#include "clc/relational/clc_isnan.h" #include "clc/relational/clc_select.h" -#include "clc/relational/clc_signbit.h" +#include "clc/shared/clc_max.h" +#include "clc/shared/clc_min.h" #define __CLC_BODY "clc_atan2.inc" #include "clc/math/gentype.inc" diff --git a/libclc/clc/lib/generic/math/clc_atan2.inc b/libclc/clc/lib/generic/math/clc_atan2.inc index f8e7c9638180d..19eaaeee0092d 100644 --- a/libclc/clc/lib/generic/math/clc_atan2.inc +++ b/libclc/clc/lib/generic/math/clc_atan2.inc @@ -6,112 +6,243 @@ // //===----------------------------------------------------------------------===// -#pragma OPENCL FP_CONTRACT OFF - #if __CLC_FPSIZE == 32 -_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_FLOATN __clc_atan2(__CLC_FLOATN y, - __CLC_FLOATN x) { - const __CLC_FLOATN pi = 0x1.921fb6p+1f; - const __CLC_FLOATN piby2 = 0x1.921fb6p+0f; - const __CLC_FLOATN piby4 = 0x1.921fb6p-1f; - const __CLC_FLOATN threepiby4 = 0x1.2d97c8p+1f; - - __CLC_FLOATN ax = __clc_fabs(x); - __CLC_FLOATN ay = __clc_fabs(y); - __CLC_FLOATN v = __clc_fmin(ax, ay); - __CLC_FLOATN u = __clc_fmax(ax, ay); - - __CLC_FLOATN vbyu = v / u; +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 0x1.921fb6p+1f; + const __CLC_GENTYPE piby2 = 0x1.921fb6p+0f; + const __CLC_GENTYPE piby4 = 0x1.921fb6p-1f; + const __CLC_GENTYPE threepiby4 = 0x1.2d97c8p+1f; + + __CLC_GENTYPE ax = __clc_fabs(x); + __CLC_GENTYPE ay = __clc_fabs(y); + __CLC_GENTYPE v = __clc_min(ax, ay); + __CLC_GENTYPE u = __clc_max(ax, ay); + + // Scale since u could be large, as in "regular" divide + __CLC_GENTYPE s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f; + __CLC_GENTYPE vbyu = s * MATH_DIVIDE(v, s * u); + + __CLC_GENTYPE vbyu2 = vbyu * vbyu; + +#define USE_2_2_APPROXIMATION +#if defined USE_2_2_APPROXIMATION + __CLC_GENTYPE p = + __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), + -0x1.5554d0p-2f) * + vbyu2 * vbyu; + __CLC_GENTYPE q = + __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); +#else + __CLC_GENTYPE p = + __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), + -0x1.55554ep-2f) * + vbyu2 * vbyu; + __CLC_GENTYPE q = __clc_mad( + vbyu2, + __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), + 0x1.76b4b8p+0f), + 1.0f); +#endif - __CLC_FLOATN a = __clc_atan_reduced(vbyu); + // Octant 0 result + __CLC_GENTYPE a = __clc_mad(p, MATH_RECIP(q), vbyu); - __CLC_FLOATN t = piby2 - a; - a = ay > ax ? t : a; - t = pi - a; - a = x < 0.0f ? t : a; + // Fix up 3 other octants + __CLC_GENTYPE at = piby2 - a; + a = ay > ax ? at : a; + at = pi - a; + a = x < 0.0F ? at : a; - t = __clc_signbit(x) ? pi : 0.0f; - a = y == 0.0f ? t : a; + // y == 0 => 0 for x >= 0, pi for x < 0 + at = __CLC_AS_INTN(x) < 0 ? pi : 0.0f; + a = y == 0.0f ? at : a; // x and y are +- Inf - t = x < 0.0f ? threepiby4 : piby4; - a = (__clc_isinf(x) & __clc_isinf(y)) ? t : a; + at = x > 0.0f ? piby4 : threepiby4; + a = __clc_select(a, at, __clc_isinf(x) && __clc_isinf(y)); // x or y is NaN - a = __clc_isunordered(x, y) ? FLT_NAN : a; + a = __clc_select(a, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isnan(y)); + // Fixup sign and return return __clc_copysign(a, y); } #elif __CLC_FPSIZE == 64 -_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_DOUBLEN __clc_atan2(__CLC_DOUBLEN y, - __CLC_DOUBLEN x) { - const __CLC_DOUBLEN pi = 0x1.921fb54442d18p+1; - const __CLC_DOUBLEN piby2 = 0x1.921fb54442d18p+0; - const __CLC_DOUBLEN piby4 = 0x1.921fb54442d18p-1; - const __CLC_DOUBLEN threepiby4 = 0x1.2d97c7f3321d2p+1; - - __CLC_DOUBLEN ay = __clc_fabs(y); - __CLC_DOUBLEN ax = __clc_fabs(x); - __CLC_DOUBLEN u = __clc_fmax(ax, ay); - __CLC_DOUBLEN v = __clc_fmin(ax, ay); - __CLC_DOUBLEN vbyu = v / u; - - __CLC_DOUBLEN a = __clc_atan_reduced(vbyu); - __CLC_LONGN xneg = __clc_signbit(x); - - __CLC_DOUBLEN t = piby2 - a; - a = ax < ay ? t : a; - t = pi - a; - a = xneg ? t : a; - - t = xneg ? pi : 0.0; - a = y == 0.0 ? t : a; - - t = xneg ? threepiby4 : piby4; - t = __clc_copysign(t, y); - a = (__clc_isinf(x) && __clc_isinf(y)) ? t : a; - - a = __clc_isunordered(x, y) ? DBL_NAN : a; - return __clc_copysign(a, y); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ + const __CLC_GENTYPE piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */ + const __CLC_GENTYPE piby4 = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */ + // 0x4002d97c7f3321d2 + const __CLC_GENTYPE three_piby4 = 2.3561944901923449e+00; + const __CLC_GENTYPE pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ + const __CLC_GENTYPE pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ + // 0x3ff921fb54442d18 + const __CLC_GENTYPE piby2_head = 1.5707963267948965e+00; + // 0x3c91a62633145c07 + const __CLC_GENTYPE piby2_tail = 6.1232339957367660e-17; + + __CLC_GENTYPE x2 = x; + // Important to capture -0.0 in xneg and yneg, so comparison done as integer + __CLC_LONGN xneg = __CLC_AS_LONGN(x) < 0; + __CLC_INTN xexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_GENTYPE y2 = y; + __CLC_LONGN yneg = __CLC_AS_LONGN(y) < 0; + __CLC_INTN yexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_LONGN cond2 = __CLC_CONVERT_LONGN(xexp < 1021 && yexp < 1021); + __CLC_LONGN diffexp = __CLC_CONVERT_LONGN(yexp - xexp); + + // Scale up both x and y if they are both below 1/4 + __CLC_GENTYPE x1 = __clc_ldexp(x, 1024); + __CLC_INTN xexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_GENTYPE y1 = __clc_ldexp(y, 1024); + __CLC_INTN yexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_LONGN diffexp1 = __CLC_CONVERT_LONGN(yexp1 - xexp1); + + diffexp = __clc_select(diffexp, diffexp1, cond2); + x = cond2 ? x1 : x; + y = cond2 ? y1 : y; + + // General case: take absolute values of arguments + __CLC_GENTYPE u = __clc_fabs(x); + __CLC_GENTYPE v = __clc_fabs(y); + + // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. + __CLC_LONGN swap_vu = u < v; + __CLC_GENTYPE uu = u; + u = swap_vu ? v : u; + v = swap_vu ? uu : v; + + __CLC_GENTYPE vbyu = v / u; + __CLC_GENTYPE q1, q2; + + // General values of v/u. Use a look-up table and series expansion. + + { + __CLC_GENTYPE val = vbyu > 0.0625 ? vbyu : 0.063; + __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5)); + q1 = __CLC_USE_TABLE(atan_jby256_tbl_head, index - 16); + q2 = __CLC_USE_TABLE(atan_jby256_tbl_tail, index - 16); + __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8; + + // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 + // u_exponent could be EMAX so we have to do it in 2 steps + __CLC_INTN m = + -(__CLC_CONVERT_INTN(__CLC_AS_ULONGN(u) >> EXPSHIFTBITS_DP64) - + EXPBIAS_DP64); + __CLC_GENTYPE um = __clc_ldexp(u, m); + __CLC_GENTYPE vm = __clc_ldexp(v, m); + + // 26 leading bits of u + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL); + __CLC_GENTYPE u2 = um - u1; + + __CLC_GENTYPE r = MATH_DIVIDE(__clc_fma(-c, u2, __clc_fma(-c, u1, vm)), + __clc_fma(c, vm, um)); + + // Polynomial approximation to atan(r) + __CLC_GENTYPE s = r * r; + q2 = q2 + __clc_fma((s * __clc_fma(-s, 0.19999918038989143496, + 0.33333333333224095522)), + -r, r); + } + + __CLC_GENTYPE q3, q4; + { + q3 = 0.0; + q4 = vbyu; + } + + __CLC_GENTYPE q5, q6; + { + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL); + __CLC_GENTYPE u2 = u - u1; + __CLC_GENTYPE vu1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL); + __CLC_GENTYPE vu2 = vbyu - vu1; + + q5 = 0.0; + __CLC_GENTYPE s = vbyu * vbyu; + q6 = vbyu + + __clc_fma( + -vbyu * s, + __clc_fma( + -s, + __clc_fma(-s, + __clc_fma(-s, + __clc_fma(-s, 0.90029810285449784439E-01, + 0.11110736283514525407), + 0.14285713561807169030), + 0.19999999999393223405), + 0.33333333333333170500), + MATH_DIVIDE(__clc_fma(-u, vu2, + __clc_fma(-u2, vu1, __clc_fma(-u1, vu1, v))), + u)); + } + + q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; + q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6; + + q1 = vbyu > 0.0625 ? q1 : q3; + q2 = vbyu > 0.0625 ? q2 : q4; + + // Tidy-up according to which quadrant the arguments lie in + __CLC_GENTYPE res1, res2, res3, res4; + q1 = swap_vu ? piby2_head - q1 : q1; + q2 = swap_vu ? piby2_tail - q2 : q2; + q1 = xneg ? pi_head - q1 : q1; + q2 = xneg ? pi_tail - q2 : q2; + q1 = q1 + q2; + res4 = yneg ? -q1 : q1; + + res1 = yneg ? -three_piby4 : three_piby4; + res2 = yneg ? -piby4 : piby4; + res3 = xneg ? res1 : res2; + + res3 = __clc_select(res4, res3, + __CLC_CONVERT_LONGN(__clc_isinf(x2) && __clc_isinf(y2))); + res1 = yneg ? -pi : pi; + + // abs(x)/abs(y) > 2^56 and x < 0 + res3 = (diffexp < -56 && xneg) ? res1 : res3; + + res4 = MATH_DIVIDE(y, x); + // x positive and dominant over y by a factor of 2^28 + res3 = diffexp < -28 && xneg == 0 ? res4 : res3; + + // abs(y)/abs(x) > 2^56 + res4 = yneg ? -piby2 : piby2; // atan(y/x) is insignificant compared to piby2 + res3 = diffexp > 56 ? res4 : res3; + + res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y + res4 = xneg ? res1 : y2; + + // Zero y gives +-0 for positive x and +-pi for negative x + res3 = y2 == 0.0 ? res4 : res3; + res3 = __clc_isnan(y2) ? y2 : res3; + res3 = __clc_isnan(x2) ? x2 : res3; + + return res3; } #elif __CLC_FPSIZE == 16 -_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_HALFN __clc_atan2(__CLC_HALFN y, - __CLC_HALFN x) { - const __CLC_HALFN pi = 0x1.921fb6p+1h; - const __CLC_HALFN piby2 = 0x1.921fb6p+0h; - const __CLC_HALFN piby4 = 0x1.921fb6p-1h; - const __CLC_HALFN threepiby4 = 0x1.2d97c8p+1h; - - __CLC_HALFN ax = __clc_fabs(x); - __CLC_HALFN ay = __clc_fabs(y); - __CLC_HALFN v = __clc_fmin(ax, ay); - __CLC_HALFN u = __clc_fmax(ax, ay); - - __CLC_HALFN vbyu = v / u; - - __CLC_HALFN a = __clc_atan_reduced(vbyu); - - __CLC_HALFN t = piby2 - a; - a = ay > ax ? t : a; - t = pi - a; - a = x < 0.0h ? t : a; - - t = __clc_signbit(x) ? pi : 0.0h; - a = y == 0.0h ? t : a; - - // x and y are +- Inf - t = x < 0.0h ? threepiby4 : piby4; - a = (__clc_isinf(x) && __clc_isinf(y)) ? t : a; - - // x or y is NaN - a = __clc_isunordered(x, y) ? HALF_NAN : a; - - return __clc_copysign(a, y); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE x, + __CLC_GENTYPE y) { + return __CLC_CONVERT_GENTYPE( + __clc_atan2(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y))); } #endif >From 2ee1ad436bfcd85efd3e95b3e65e64edd0512c08 Mon Sep 17 00:00:00 2001 From: Karol Herbst <[email protected]> Date: Wed, 1 Apr 2026 02:42:19 +0200 Subject: [PATCH 2/5] Revert "libclc: Update atan2pi (#188707)" This reverts commit 8430e9e8f043d2ea28925e299157f760711cbedb. --- libclc/clc/lib/generic/math/clc_atan2pi.cl | 10 +- libclc/clc/lib/generic/math/clc_atan2pi.inc | 263 ++++++++++++++------ 2 files changed, 198 insertions(+), 75 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_atan2pi.cl b/libclc/clc/lib/generic/math/clc_atan2pi.cl index cd18b4876da86..cee4d8451d85a 100644 --- a/libclc/clc/lib/generic/math/clc_atan2pi.cl +++ b/libclc/clc/lib/generic/math/clc_atan2pi.cl @@ -6,19 +6,19 @@ // //===----------------------------------------------------------------------===// +#include "clc/clc_convert.h" #include "clc/float/definitions.h" #include "clc/internal/clc.h" -#include "clc/math/clc_atan_helpers.h" #include "clc/math/clc_copysign.h" #include "clc/math/clc_fabs.h" #include "clc/math/clc_fma.h" -#include "clc/math/clc_fmax.h" -#include "clc/math/clc_fmin.h" +#include "clc/math/clc_ldexp.h" #include "clc/math/clc_mad.h" +#include "clc/math/math.h" +#include "clc/math/tables.h" #include "clc/relational/clc_isinf.h" -#include "clc/relational/clc_isunordered.h" +#include "clc/relational/clc_isnan.h" #include "clc/relational/clc_select.h" -#include "clc/relational/clc_signbit.h" #include "clc/shared/clc_max.h" #include "clc/shared/clc_min.h" diff --git a/libclc/clc/lib/generic/math/clc_atan2pi.inc b/libclc/clc/lib/generic/math/clc_atan2pi.inc index 76b5e45547173..9f901947887d8 100644 --- a/libclc/clc/lib/generic/math/clc_atan2pi.inc +++ b/libclc/clc/lib/generic/math/clc_atan2pi.inc @@ -6,99 +6,222 @@ // //===----------------------------------------------------------------------===// -#pragma OPENCL FP_CONTRACT OFF - #if __CLC_FPSIZE == 32 -_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_FLOATN __clc_atan2pi(__CLC_FLOATN y, - __CLC_FLOATN x) { - __CLC_FLOATN ax = __clc_fabs(x); - __CLC_FLOATN ay = __clc_fabs(y); - __CLC_FLOATN v = __clc_fmin(ax, ay); - __CLC_FLOATN u = __clc_fmax(ax, ay); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 0x1.921fb6p+1f; + + __CLC_GENTYPE ax = __clc_fabs(x); + __CLC_GENTYPE ay = __clc_fabs(y); + __CLC_GENTYPE v = __clc_min(ax, ay); + __CLC_GENTYPE u = __clc_max(ax, ay); + + // Scale since u could be large, as in "regular" divide + __CLC_GENTYPE s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f; + __CLC_GENTYPE vbyu = s * MATH_DIVIDE(v, s * u); - __CLC_FLOATN vbyu = v / u; + __CLC_GENTYPE vbyu2 = vbyu * vbyu; - __CLC_FLOATN a = __clc_atanpi_reduced(vbyu); + __CLC_GENTYPE p = + __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), + -0x1.5554d0p-2f) * + vbyu2 * vbyu; + __CLC_GENTYPE q = + __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); - __CLC_FLOATN at = 0.5f - a; + // Octant 0 result + __CLC_GENTYPE a = MATH_DIVIDE(__clc_mad(p, MATH_RECIP(q), vbyu), pi); + + // Fix up 3 other octants + __CLC_GENTYPE at = 0.5f - a; a = ay > ax ? at : a; at = 1.0f - a; - a = x < 0.0f ? at : a; + a = x < 0.0F ? at : a; - at = __clc_signbit(x) ? 1.0f : 0.0f; + // y == 0 => 0 for x >= 0, pi for x < 0 + at = __CLC_AS_INTN(x) < 0 ? 1.0f : 0.0f; a = y == 0.0f ? at : a; // x and y are +- Inf - at = x < 0.0f ? 0.75f : 0.25f; - a = (__clc_isinf(x) & __clc_isinf(y)) ? at : a; + at = x > 0.0f ? 0.25f : 0.75f; + a = __clc_select(a, at, __clc_isinf(x) && __clc_isinf(y)); // x or y is NaN - a = __clc_isunordered(x, y) ? FLT_NAN : a; + a = __clc_select(a, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isnan(y)); + // Fixup sign and return return __clc_copysign(a, y); } #elif __CLC_FPSIZE == 64 -_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_DOUBLEN __clc_atan2pi(__CLC_DOUBLEN y, - __CLC_DOUBLEN x) { - __CLC_DOUBLEN ay = __clc_fabs(y); - __CLC_DOUBLEN ax = __clc_fabs(x); - __CLC_DOUBLEN u = __clc_fmax(ax, ay); - __CLC_DOUBLEN v = __clc_fmin(ax, ay); - __CLC_DOUBLEN vbyu = v / u; - - __CLC_DOUBLEN a = __clc_atanpi_reduced(vbyu); - - __CLC_LONGN xneg = __clc_signbit(x); - - __CLC_DOUBLEN t = 0.5 - a; - a = ax < ay ? t : a; - t = 1.0 - a; - a = xneg ? t : a; - - t = xneg ? 1.0 : 0.0; - a = y == 0.0 ? t : a; - - t = xneg ? 0.75 : 0.25; - t = __clc_copysign(t, y); - a = (__clc_isinf(x) & __clc_isinf(y)) ? t : a; - - a = __clc_isunordered(x, y) ? DBL_NAN : a; - - return __clc_copysign(a, y); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ + const __CLC_GENTYPE pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ + const __CLC_GENTYPE pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ + // 0x3ff921fb54442d18 + const __CLC_GENTYPE piby2_head = 1.5707963267948965e+00; + // 0x3c91a62633145c07 + const __CLC_GENTYPE piby2_tail = 6.1232339957367660e-17; + + __CLC_GENTYPE x2 = x; + __CLC_LONGN xneg = __CLC_AS_LONGN(x) < 0; + __CLC_INTN xexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_GENTYPE y2 = y; + __CLC_LONGN yneg = __CLC_AS_LONGN(y) < 0; + __CLC_INTN yexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_LONGN cond2 = __CLC_CONVERT_LONGN(xexp < 1021 & yexp < 1021); + __CLC_LONGN diffexp = __CLC_CONVERT_LONGN(yexp - xexp); + + // Scale up both x and y if they are both below 1/4 + __CLC_GENTYPE x1 = __clc_ldexp(x, 1024); + __CLC_INTN xexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_GENTYPE y1 = __clc_ldexp(y, 1024); + __CLC_INTN yexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_LONGN diffexp1 = __CLC_CONVERT_LONGN(yexp1 - xexp1); + + diffexp = __clc_select(diffexp, diffexp1, cond2); + x = cond2 ? x1 : x; + y = cond2 ? y1 : y; + + // General case: take absolute values of arguments + __CLC_GENTYPE u = __clc_fabs(x); + __CLC_GENTYPE v = __clc_fabs(y); + + // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. + __CLC_LONGN swap_vu = u < v; + __CLC_GENTYPE uu = u; + u = swap_vu ? v : u; + v = swap_vu ? uu : v; + + __CLC_GENTYPE vbyu = v / u; + __CLC_GENTYPE q1, q2; + + // General values of v/u. Use a look-up table and series expansion. + + { + __CLC_GENTYPE val = vbyu > 0.0625 ? vbyu : 0.063; + __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5)); + q1 = __CLC_USE_TABLE(atan_jby256_tbl_head, (index - 16)); + q2 = __CLC_USE_TABLE(atan_jby256_tbl_tail, (index - 16)); + __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8; + + // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 + // u_exponent could be EMAX so we have to do it in 2 steps + __CLC_INTN m = + -(__CLC_CONVERT_INTN(__CLC_AS_ULONGN(u) >> EXPSHIFTBITS_DP64) - + EXPBIAS_DP64); + __CLC_GENTYPE um = __clc_ldexp(u, m); + __CLC_GENTYPE vm = __clc_ldexp(v, m); + + // 26 leading bits of u + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL); + __CLC_GENTYPE u2 = um - u1; + + __CLC_GENTYPE r = MATH_DIVIDE(__clc_fma(-c, u2, __clc_fma(-c, u1, vm)), + __clc_fma(c, vm, um)); + + // Polynomial approximation to atan(r) + __CLC_GENTYPE s = r * r; + q2 = q2 + __clc_fma((s * __clc_fma(-s, 0.19999918038989143496, + 0.33333333333224095522)), + -r, r); + } + + __CLC_GENTYPE q3, q4; + { + q3 = 0.0; + q4 = vbyu; + } + + __CLC_GENTYPE q5, q6; + { + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL); + __CLC_GENTYPE u2 = u - u1; + __CLC_GENTYPE vu1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL); + __CLC_GENTYPE vu2 = vbyu - vu1; + + q5 = 0.0; + __CLC_GENTYPE s = vbyu * vbyu; + q6 = vbyu + + __clc_fma( + -vbyu * s, + __clc_fma( + -s, + __clc_fma(-s, + __clc_fma(-s, + __clc_fma(-s, 0.90029810285449784439E-01, + 0.11110736283514525407), + 0.14285713561807169030), + 0.19999999999393223405), + 0.33333333333333170500), + MATH_DIVIDE(__clc_fma(-u, vu2, + __clc_fma(-u2, vu1, __clc_fma(-u1, vu1, v))), + u)); + } + + q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; + q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6; + + q1 = vbyu > 0.0625 ? q1 : q3; + q2 = vbyu > 0.0625 ? q2 : q4; + + // Tidy-up according to which quadrant the arguments lie in + __CLC_GENTYPE res1, res2, res3, res4; + q1 = swap_vu ? piby2_head - q1 : q1; + q2 = swap_vu ? piby2_tail - q2 : q2; + q1 = xneg ? pi_head - q1 : q1; + q2 = xneg ? pi_tail - q2 : q2; + q1 = MATH_DIVIDE(q1 + q2, pi); + res4 = yneg ? -q1 : q1; + + res1 = yneg ? -0.75 : 0.75; + res2 = yneg ? -0.25 : 0.25; + res3 = xneg ? res1 : res2; + + res3 = __clc_select(res4, res3, + __CLC_CONVERT_LONGN(__clc_isinf(y2) & __clc_isinf(x2))); + res1 = yneg ? -1.0 : 1.0; + + // abs(x)/abs(y) > 2^56 and x < 0 + res3 = diffexp < -56 && xneg ? res1 : res3; + + res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi); + // x positive and dominant over y by a factor of 2^28 + res3 = diffexp < -28 && xneg == 0 ? res4 : res3; + + // abs(y)/abs(x) > 2^56 + res4 = yneg ? -0.5 : 0.5; // atan(y/x) is insignificant compared to piby2 + res3 = diffexp > 56 ? res4 : res3; + + res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y + res4 = xneg ? res1 : y2; + + // Zero y gives +-0 for positive x and +-pi for negative x + res3 = y2 == 0.0 ? res4 : res3; + res3 = __clc_isnan(y2) ? y2 : res3; + res3 = __clc_isnan(x2) ? x2 : res3; + + return res3; } #elif __CLC_FPSIZE == 16 -_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_HALFN __clc_atan2pi(__CLC_HALFN y, - __CLC_HALFN x) { - __CLC_HALFN ax = __clc_fabs(x); - __CLC_HALFN ay = __clc_fabs(y); - __CLC_HALFN v = __clc_fmin(ax, ay); - __CLC_HALFN u = __clc_fmax(ax, ay); - - __CLC_HALFN vbyu = v / u; - - __CLC_HALFN a = __clc_atanpi_reduced(vbyu); - - __CLC_HALFN at = 0.5h - a; - a = ay > ax ? at : a; - at = 1.0h - a; - a = x < 0.0h ? at : a; - - at = __clc_signbit(x) ? 1.0h : 0.0h; - a = y == 0.0h ? at : a; - - // x and y are +- Inf - at = x < 0.0h ? 0.75h : 0.25h; - a = (__clc_isinf(x) & __clc_isinf(y)) ? at : a; - - // x or y is NaN - a = __clc_isunordered(x, y) ? HALF_NAN : a; - - return __clc_copysign(a, y); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE x, + __CLC_GENTYPE y) { + return __CLC_CONVERT_GENTYPE( + __clc_atan2pi(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y))); } #endif >From 457734770dd66b0a2c6f350939baef7e6732be5c Mon Sep 17 00:00:00 2001 From: Karol Herbst <[email protected]> Date: Wed, 1 Apr 2026 02:43:25 +0200 Subject: [PATCH 3/5] Revert "libclc: Implement remainder with remquo" This reverts commit befad798a91ef8c74f605bdb15c352528f8d9807. --- libclc/clc/lib/generic/math/clc_remainder.cl | 223 +++++++++++++++++- libclc/clc/lib/generic/math/clc_remainder.inc | 13 - 2 files changed, 221 insertions(+), 15 deletions(-) delete mode 100644 libclc/clc/lib/generic/math/clc_remainder.inc diff --git a/libclc/clc/lib/generic/math/clc_remainder.cl b/libclc/clc/lib/generic/math/clc_remainder.cl index d3979fbac3ffd..f1dba87ee5b43 100644 --- a/libclc/clc/lib/generic/math/clc_remainder.cl +++ b/libclc/clc/lib/generic/math/clc_remainder.cl @@ -6,8 +6,227 @@ // //===----------------------------------------------------------------------===// +#include "clc/clc_convert.h" +#include "clc/integer/clc_clz.h" +#include "clc/internal/clc.h" +#include "clc/math/clc_floor.h" +#include "clc/math/clc_fma.h" +#include "clc/math/clc_ldexp.h" #include "clc/math/clc_remainder.h" -#include "clc/math/clc_remquo.h" +#include "clc/math/clc_trunc.h" +#include "clc/math/math.h" +#include "clc/shared/clc_max.h" -#define __CLC_BODY "clc_remainder.inc" +_CLC_DEF _CLC_OVERLOAD float __clc_remainder(float x, float y) { + int ux = __clc_as_int(x); + int ax = ux & EXSIGNBIT_SP32; + float xa = __clc_as_float(ax); + int sx = ux ^ ax; + int ex = ax >> EXPSHIFTBITS_SP32; + + int uy = __clc_as_int(y); + int ay = uy & EXSIGNBIT_SP32; + float ya = __clc_as_float(ay); + int ey = ay >> EXPSHIFTBITS_SP32; + + float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); + float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); + int c; + int k = ex - ey; + + uint q = 0; + + while (k > 0) { + c = xr >= yr; + q = (q << 1) | c; + xr -= c ? yr : 0.0f; + xr += xr; + --k; + } + + c = xr > yr; + q = (q << 1) | c; + xr -= c ? yr : 0.0f; + + int lt = ex < ey; + + q = lt ? 0 : q; + xr = lt ? xa : xr; + yr = lt ? ya : yr; + + c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1)); + xr -= c ? yr : 0.0f; + q += c; + + float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); + xr *= lt ? 1.0f : s; + + c = ax == ay; + xr = c ? 0.0f : xr; + + xr = __clc_as_float(sx ^ __clc_as_int(xr)); + + c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | + ay == 0; + xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; + + return xr; +} + +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_remainder +#define __CLC_BODY "clc/shared/binary_def_scalarize.inc" #include "clc/math/gentype.inc" +#undef __CLC_FUNCTION + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { + ulong ux = __clc_as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + ulong xsgn = ux ^ ax; + double dx = __clc_as_double(ax); + int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64); + int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); + xexp1 = xexp < 1 ? xexp1 : xexp; + + ulong uy = __clc_as_ulong(y); + ulong ay = uy & ~SIGNBIT_DP64; + double dy = __clc_as_double(ay); + int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64); + int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); + yexp1 = yexp < 1 ? yexp1 : yexp; + + int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1; + + // First assume |x| > |y| + + // Set ntimes to the number of times we need to do a + // partial remainder. If the exponent of x is an exact multiple + // of 53 larger than the exponent of y, and the mantissa of x is + // less than the mantissa of y, ntimes will be one too large + // but it doesn't matter - it just means that we'll go round + // the loop below one extra time. + int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); + double w = __clc_ldexp(dy, ntimes * 53); + w = ntimes == 0 ? dy : w; + double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; + + // Each time round the loop we compute a partial remainder. + // This is done by subtracting a large multiple of w + // from x each time, where w is a scaled up version of y. + // The subtraction must be performed exactly in quad + // precision, though the result at each stage can + // fit exactly in a double precision number. + int i; + double t, v, p, pp; + + for (i = 0; i < ntimes; i++) { + // Compute integral multiplier + t = __clc_trunc(dx / w); + + // Compute w * t in quad precision + p = w * t; + pp = __clc_fma(w, t, -p); + + // Subtract w * t from dx + v = dx - p; + dx = v + (((dx - v) - p) - pp); + + // If t was one too large, dx will be negative. Add back one w. + dx += dx < 0.0 ? w : 0.0; + + // Scale w down by 2^(-53) for the next iteration + w *= scale; + } + + // One more time + // Variable todd says whether the integer t is odd or not + t = __clc_floor(dx / w); + long lt = (long)t; + int todd = lt & 1; + + p = w * t; + pp = __clc_fma(w, t, -p); + v = dx - p; + dx = v + (((dx - v) - p) - pp); + i = dx < 0.0; + todd ^= i; + dx += i ? w : 0.0; + + // At this point, dx lies in the range [0,dy) + + // For the fmod function, we're done apart from setting the correct sign. + // + // For the remainder function, we need to adjust dx + // so that it lies in the range (-y/2, y/2] by carefully + // subtracting w (== dy == y) if necessary. The rigmarole + // with todd is to get the correct sign of the result + // when x/y lies exactly half way between two integers, + // when we need to choose the even integer. + + int al = (2.0 * dx > w) | (todd & (2.0 * dx == w)); + double dxl = dx - (al ? w : 0.0); + + int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w)); + double dxg = dx - (ag ? w : 0.0); + + dx = dy < 0x1.0p+1022 ? dxl : dxg; + + double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx)); + dx = __clc_as_double(ax); + + // Now handle |x| == |y| + int c = dx == dy; + t = __clc_as_double(xsgn); + ret = c ? t : ret; + + // Next, handle |x| < |y| + c = dx < dy; + ret = c ? x : ret; + + c &= (yexp<1023 & 2.0 * dx> dy) | (dx > 0.5 * dy); + // we could use a conversion here instead since qsgn = +-1 + p = qsgn == 1 ? -1.0 : 1.0; + t = __clc_fma(y, p, x); + ret = c ? t : ret; + + // We don't need anything special for |x| == 0 + + // |y| is 0 + c = dy == 0.0; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; + + // y is +-Inf, NaN + c = yexp > BIASEDEMAX_DP64; + t = y == y ? x : y; + ret = c ? t : ret; + + // x is +=Inf, NaN + c = xexp > BIASEDEMAX_DP64; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; + + return ret; +} + +#define __CLC_DOUBLE_ONLY +#define __CLC_FUNCTION __clc_remainder +#define __CLC_BODY "clc/shared/binary_def_scalarize.inc" +#include "clc/math/gentype.inc" +#undef __CLC_FUNCTION + +#endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// Forward the half version of this builtin onto the float one +#define __CLC_HALF_ONLY +#define __CLC_FUNCTION __clc_remainder +#define __CLC_BODY "clc/math/binary_def_via_fp32.inc" +#include "clc/math/gentype.inc" + +#endif diff --git a/libclc/clc/lib/generic/math/clc_remainder.inc b/libclc/clc/lib/generic/math/clc_remainder.inc deleted file mode 100644 index 9f2e0ba1539a6..0000000000000 --- a/libclc/clc/lib/generic/math/clc_remainder.inc +++ /dev/null @@ -1,13 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE -__clc_remainder(__CLC_GENTYPE x, __CLC_GENTYPE y) { - __CLC_INTN unused_quo; - return __clc_remquo(x, y, &unused_quo); -} >From e30a3c43bff5ffa50ca14570173bea3eefd3f703 Mon Sep 17 00:00:00 2001 From: Karol Herbst <[email protected]> Date: Wed, 1 Apr 2026 02:44:30 +0200 Subject: [PATCH 4/5] Revert "libclc: Update remquo (#187998)" This reverts commit 1a9fe1769a0f9684dc9ecfcb7a2cec2d66077cc3. --- libclc/clc/include/clc/math/remquo_decl.inc | 34 +-- libclc/clc/lib/generic/math/clc_remquo.cl | 56 +--- libclc/clc/lib/generic/math/clc_remquo.inc | 273 +++++++++++++++++- .../clc/lib/generic/math/clc_remquo_stret.inc | 158 ---------- 4 files changed, 287 insertions(+), 234 deletions(-) delete mode 100644 libclc/clc/lib/generic/math/clc_remquo_stret.inc diff --git a/libclc/clc/include/clc/math/remquo_decl.inc b/libclc/clc/include/clc/math/remquo_decl.inc index 8ba601199ef0f..cba28a7244eb4 100644 --- a/libclc/clc/include/clc/math/remquo_decl.inc +++ b/libclc/clc/include/clc/math/remquo_decl.inc @@ -6,29 +6,19 @@ // //===----------------------------------------------------------------------===// -typedef struct __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE) { - __CLC_GENTYPE rem; - __CLC_INTN quo; -} __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + private __CLC_INTN *q); -#define __CLC_REMQUO_RET_GENTYPE __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE) +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + global __CLC_INTN *q); -_CLC_OVERLOAD _CLC_DECL __CLC_REMQUO_RET_GENTYPE -__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y); - -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x, - __CLC_GENTYPE y, - private __CLC_INTN *q); - -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x, - __CLC_GENTYPE y, - global __CLC_INTN *q); - -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x, - __CLC_GENTYPE y, - local __CLC_INTN *q); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + local __CLC_INTN *q); #if _CLC_GENERIC_AS_SUPPORTED -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x, - __CLC_GENTYPE y, - generic __CLC_INTN *q); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + generic __CLC_INTN *q); #endif diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl index 502b9e5edc405..e254093d591d4 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.cl +++ b/libclc/clc/lib/generic/math/clc_remquo.cl @@ -6,58 +6,32 @@ // //===----------------------------------------------------------------------===// -#include "clc/math/clc_remquo.h" - #include "clc/clc_convert.h" -#include "clc/float/definitions.h" #include "clc/integer/clc_clz.h" -#include "clc/math/clc_copysign.h" -#include "clc/math/clc_fabs.h" +#include "clc/internal/clc.h" +#include "clc/math/clc_floor.h" #include "clc/math/clc_flush_if_daz.h" #include "clc/math/clc_fma.h" -#include "clc/math/clc_frexp.h" #include "clc/math/clc_ldexp.h" -#include "clc/math/clc_mad.h" -#include "clc/math/clc_recip_fast.h" -#include "clc/math/clc_rint.h" #include "clc/math/clc_subnormal_config.h" #include "clc/math/clc_trunc.h" #include "clc/math/math.h" -#include "clc/relational/clc_isfinite.h" -#include "clc/relational/clc_isnan.h" -#include "clc/relational/clc_signbit.h" - -#define __CLC_FUNCTION __clc_remquo_stret -#define __CLC_BODY "clc_remquo_stret.inc" -#include "clc/math/gentype.inc" -#undef __CLC_FUNCTION - -#define __CLC_FUNCTION __clc_remquo -#define __CLC_BODY "clc_remquo.inc" -#include "clc/math/gentype.inc" +#include "clc/shared/clc_max.h" -#define __CLC_OUT_ARG3_SCALAR_TYPE int -#define __CLC_OUT_ARG3_ADDRESS_SPACE __private -#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc" -#include "clc/math/gentype.inc" -#undef __CLC_OUT_ARG3_ADDRESS_SPACE +#define __CLC_ADDRESS_SPACE private +#include "clc_remquo.inc" +#undef __CLC_ADDRESS_SPACE -#define __CLC_OUT_ARG3_SCALAR_TYPE int -#define __CLC_OUT_ARG3_ADDRESS_SPACE __local -#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc" -#include "clc/math/gentype.inc" -#undef __CLC_OUT_ARG3_ADDRESS_SPACE +#define __CLC_ADDRESS_SPACE global +#include "clc_remquo.inc" +#undef __CLC_ADDRESS_SPACE -#define __CLC_OUT_ARG3_SCALAR_TYPE int -#define __CLC_OUT_ARG3_ADDRESS_SPACE __global -#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc" -#include "clc/math/gentype.inc" -#undef __CLC_OUT_ARG3_ADDRESS_SPACE +#define __CLC_ADDRESS_SPACE local +#include "clc_remquo.inc" +#undef __CLC_ADDRESS_SPACE #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -#define __CLC_OUT_ARG3_SCALAR_TYPE int -#define __CLC_OUT_ARG3_ADDRESS_SPACE -#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc" -#include "clc/math/gentype.inc" -#undef __CLC_OUT_ARG3_ADDRESS_SPACE +#define __CLC_ADDRESS_SPACE generic +#include "clc_remquo.inc" +#undef __CLC_ADDRESS_SPACE #endif diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc index 649bdd9ee8b65..cf8a5ebcea20c 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.inc +++ b/libclc/clc/lib/generic/math/clc_remquo.inc @@ -6,19 +6,266 @@ // //===----------------------------------------------------------------------===// -#ifdef __CLC_SCALAR -#define __CLC_REMQUO_DEF(addrspace) \ - _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_remquo( \ - __CLC_GENTYPE x, __CLC_GENTYPE y, addrspace __CLC_INTN *quo_out) { \ - __CLC_REMQUO_RET_GENTYPE result = __clc_remquo_stret(x, y); \ - *quo_out = result.quo; \ - return result.rem; \ +_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, + __CLC_ADDRESS_SPACE int *quo) { + x = __clc_flush_if_daz(x); + y = __clc_flush_if_daz(y); + int ux = __clc_as_int(x); + int ax = ux & EXSIGNBIT_SP32; + float xa = __clc_as_float(ax); + int sx = ux ^ ax; + int ex = ax >> EXPSHIFTBITS_SP32; + + int uy = __clc_as_int(y); + int ay = uy & EXSIGNBIT_SP32; + float ya = __clc_as_float(ay); + int sy = uy ^ ay; + int ey = ay >> EXPSHIFTBITS_SP32; + + float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); + float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); + int c; + int k = ex - ey; + + uint q = 0; + + while (k > 0) { + c = xr >= yr; + q = (q << 1) | c; + xr -= c ? yr : 0.0f; + xr += xr; + --k; + } + + c = xr > yr; + q = (q << 1) | c; + xr -= c ? yr : 0.0f; + + int lt = ex < ey; + + q = lt ? 0 : q; + xr = lt ? xa : xr; + yr = lt ? ya : yr; + + c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1)); + xr -= c ? yr : 0.0f; + q += c; + + float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); + xr *= lt ? 1.0f : s; + + int qsgn = sx == sy ? 1 : -1; + int quot = (q & 0x7f) * qsgn; + + c = ax == ay; + quot = c ? qsgn : quot; + xr = c ? 0.0f : xr; + + xr = __clc_as_float(sx ^ __clc_as_int(xr)); + + c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | + ay == 0; + quot = c ? 0 : quot; + xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; + + *quo = quot; + + return xr; +} + +// remquo signature is special, we don't have macro for this +#define __CLC_VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \ + _CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo( \ + TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, \ + __CLC_ADDRESS_SPACE int##VEC_SIZE *quo) { \ + int##HALF_VEC_SIZE lo, hi; \ + TYPE##VEC_SIZE ret; \ + ret.lo = __clc_remquo(x.lo, y.lo, &lo); \ + ret.hi = __clc_remquo(x.hi, y.hi, &hi); \ + (*quo).lo = lo; \ + (*quo).hi = hi; \ + return ret; \ } -__CLC_REMQUO_DEF(private) -__CLC_REMQUO_DEF(local) -__CLC_REMQUO_DEF(global) -#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -__CLC_REMQUO_DEF(generic) +#define __CLC_VEC3_REMQUO(TYPE) \ + _CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo( \ + TYPE##3 x, TYPE##3 y, __CLC_ADDRESS_SPACE int##3 * quo) { \ + int2 lo; \ + int hi; \ + TYPE##3 ret; \ + ret.s01 = __clc_remquo(x.s01, y.s01, &lo); \ + ret.s2 = __clc_remquo(x.s2, y.s2, &hi); \ + (*quo).s01 = lo; \ + (*quo).s2 = hi; \ + return ret; \ + } +__CLC_VEC_REMQUO(float, 2, ) +__CLC_VEC3_REMQUO(float) +__CLC_VEC_REMQUO(float, 4, 2) +__CLC_VEC_REMQUO(float, 8, 4) +__CLC_VEC_REMQUO(float, 16, 8) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, + __CLC_ADDRESS_SPACE int *pquo) { + ulong ux = __clc_as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + ulong xsgn = ux ^ ax; + double dx = __clc_as_double(ax); + int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64); + int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); + xexp1 = xexp < 1 ? xexp1 : xexp; + + ulong uy = __clc_as_ulong(y); + ulong ay = uy & ~SIGNBIT_DP64; + double dy = __clc_as_double(ay); + int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64); + int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); + yexp1 = yexp < 1 ? yexp1 : yexp; + + int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1; + + // First assume |x| > |y| + + // Set ntimes to the number of times we need to do a + // partial remainder. If the exponent of x is an exact multiple + // of 53 larger than the exponent of y, and the mantissa of x is + // less than the mantissa of y, ntimes will be one too large + // but it doesn't matter - it just means that we'll go round + // the loop below one extra time. + int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); + double w = __clc_ldexp(dy, ntimes * 53); + w = ntimes == 0 ? dy : w; + double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; + + // Each time round the loop we compute a partial remainder. + // This is done by subtracting a large multiple of w + // from x each time, where w is a scaled up version of y. + // The subtraction must be performed exactly in quad + // precision, though the result at each stage can + // fit exactly in a double precision number. + int i; + double t, v, p, pp; + + for (i = 0; i < ntimes; i++) { + // Compute integral multiplier + t = __clc_trunc(dx / w); + + // Compute w * t in quad precision + p = w * t; + pp = __clc_fma(w, t, -p); + + // Subtract w * t from dx + v = dx - p; + dx = v + (((dx - v) - p) - pp); + + // If t was one too large, dx will be negative. Add back one w. + dx += dx < 0.0 ? w : 0.0; + + // Scale w down by 2^(-53) for the next iteration + w *= scale; + } + + // One more time + // Variable todd says whether the integer t is odd or not + t = __clc_floor(dx / w); + long lt = (long)t; + int todd = lt & 1; + + p = w * t; + pp = __clc_fma(w, t, -p); + v = dx - p; + dx = v + (((dx - v) - p) - pp); + i = dx < 0.0; + todd ^= i; + dx += i ? w : 0.0; + + lt -= i; + + // At this point, dx lies in the range [0,dy) + + // For the remainder function, we need to adjust dx + // so that it lies in the range (-y/2, y/2] by carefully + // subtracting w (== dy == y) if necessary. The rigmarole + // with todd is to get the correct sign of the result + // when x/y lies exactly half way between two integers, + // when we need to choose the even integer. + + int al = (2.0 * dx > w) | (todd & (2.0 * dx == w)); + double dxl = dx - (al ? w : 0.0); + + int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w)); + double dxg = dx - (ag ? w : 0.0); + + dx = dy < 0x1.0p+1022 ? dxl : dxg; + lt += dy < 0x1.0p+1022 ? al : ag; + int quo = ((int)lt & 0x7f) * qsgn; + + double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx)); + dx = __clc_as_double(ax); + + // Now handle |x| == |y| + int c = dx == dy; + t = __clc_as_double(xsgn); + quo = c ? qsgn : quo; + ret = c ? t : ret; + + // Next, handle |x| < |y| + c = dx < dy; + quo = c ? 0 : quo; + ret = c ? x : ret; + + c &= (yexp < 1023 & 2.0 * dx > dy) | (dx > 0.5 * dy); + quo = c ? qsgn : quo; + // we could use a conversion here instead since qsgn = +-1 + p = qsgn == 1 ? -1.0 : 1.0; + t = __clc_fma(y, p, x); + ret = c ? t : ret; + + // We don't need anything special for |x| == 0 + + // |y| is 0 + c = dy == 0.0; + quo = c ? 0 : quo; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; + + // y is +-Inf, NaN + c = yexp > BIASEDEMAX_DP64; + quo = c ? 0 : quo; + t = y == y ? x : y; + ret = c ? t : ret; + + // x is +=Inf, NaN + c = xexp > BIASEDEMAX_DP64; + quo = c ? 0 : quo; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; + + *pquo = quo; + return ret; +} +__CLC_VEC_REMQUO(double, 2, ) +__CLC_VEC3_REMQUO(double) +__CLC_VEC_REMQUO(double, 4, 2) +__CLC_VEC_REMQUO(double, 8, 4) +__CLC_VEC_REMQUO(double, 16, 8) + +#endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y, + __CLC_ADDRESS_SPACE int *pquo) { + return (half)__clc_remquo((float)x, (float)y, pquo); +} +__CLC_VEC_REMQUO(half, 2, ) +__CLC_VEC3_REMQUO(half) +__CLC_VEC_REMQUO(half, 4, 2) +__CLC_VEC_REMQUO(half, 8, 4) +__CLC_VEC_REMQUO(half, 16, 8) + #endif -#endif // __CLC_SCALAR diff --git a/libclc/clc/lib/generic/math/clc_remquo_stret.inc b/libclc/clc/lib/generic/math/clc_remquo_stret.inc deleted file mode 100644 index eecc25525d8d7..0000000000000 --- a/libclc/clc/lib/generic/math/clc_remquo_stret.inc +++ /dev/null @@ -1,158 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifdef __CLC_SCALAR - -#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 -#define __CLC_REMQUO_EVAL_TYPE __CLC_GENTYPE -#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_GENTYPE -#define __CLC_S_EVAL_TYPE __CLC_S_GENTYPE -#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_S_GENTYPE -#elif __CLC_FPSIZE == 16 -#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN -#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN -#define __CLC_S_EVAL_TYPE __CLC_INTN -#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN -#endif - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_REMQUO_RET_GENTYPE -__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y) { - // How many bits of the quotient per iteration - -#if __CLC_FPSIZE == 32 - const __CLC_INTN bits = 12; - const __CLC_GENTYPE max_exp = 0x1.0p+127f; -#elif __CLC_FPSIZE == 64 - const __CLC_INTN bits = 26; - const __CLC_GENTYPE max_exp = 0x1.0p+1023; -#elif __CLC_FPSIZE == 16 - const __CLC_INTN bits = 11; - const __CLC_GENTYPE max_exp = 0x1.0p+15h; -#endif - - // Track low 7 bits of the integral quotient. - __CLC_INTN q7; - - __CLC_REMQUO_EVAL_TYPE ax = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(x)); - __CLC_REMQUO_EVAL_TYPE ay = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(y)); - - __CLC_GENTYPE ret; - - if (ax > ay) { - __CLC_INTN ex, ey; - - __CLC_REMQUO_EVAL_TYPE mx = __clc_frexp(ax, &ex); - --ex; - - __CLC_REMQUO_EVAL_TYPE my = __clc_frexp(ay, &ey); - --ey; - - ax = __clc_ldexp(mx, bits); - ay = __clc_ldexp(my, 1); - - __CLC_INTN nb = ex - ey; - __CLC_REMQUO_EVAL_TYPE ayinv = __clc_recip_fast(ay); - - __CLC_INTN qacc = 0; - - while (nb > bits) { - __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv); - -#if __CLC_FPSIZE == 16 - ax = __clc_mad(-q, ay, ax); -#else - ax = __clc_fma(-q, ay, ax); -#endif - __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0; - __CLC_REMQUO_EVAL_TYPE axp = ax + ay; - ax = clt ? axp : ax; - ax = __clc_ldexp(ax, bits); - - __CLC_INTN iq = __CLC_CONVERT_INTN(q); - iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0; - qacc = (qacc << bits) | iq; - - nb -= bits; - } - - ax = __clc_ldexp(ax, nb - bits + 1); - - __CLC_INTN is_odd; - - // Final iteration - { - __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv); -#if __CLC_FPSIZE == 16 - ax = __clc_mad(-q, ay, ax); -#else - ax = __clc_fma(-q, ay, ax); -#endif - - __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0; - __CLC_REMQUO_EVAL_TYPE axp = ax + ay; - ax = clt ? axp : ax; - __CLC_INTN iq = __CLC_CONVERT_INTN(q); - iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0; - - qacc = (qacc << (nb + 1)) | iq; - is_odd = (iq & 1) != 0; - } - - // Adjust ax so that it is the range (-y/2, y/2] - // We need to choose the even integer when x/y is midway between two - // integers - __CLC_S_EVAL_TYPE aq = ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay) | - (__CLC_CONVERT_S_EVAL_TYPE(is_odd) & - ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax == ay)); - ax = ax - (aq ? ay : (__CLC_REMQUO_EVAL_TYPE)0.0); - - ax = __clc_ldexp(ax, ey); - qacc += aq ? 1 : 0; - - __CLC_S_GENTYPE qneg = __clc_signbit(x) ^ __clc_signbit(y) ? -1 : 0; - q7 = ((qacc & 0x7f) ^ qneg) - qneg; - - ret = __clc_signbit(x) ? -ax : ax; - } else { - __CLC_S_EVAL_TYPE c = (ax > (__CLC_REMQUO_EVAL_TYPE)0.5 * ay); - if (__CLC_FPSIZE != 16) - c |= (ay < max_exp && (__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay); - - __CLC_CHARN qsgn = __CLC_CONVERT_CHARN(__clc_signbit(x) == __clc_signbit(y)) - ? (__CLC_CHARN)1 - : (__CLC_CHARN)-1; - - __CLC_GENTYPE t = __clc_mad(y, -__CLC_CONVERT_GENTYPE(qsgn), x); - ret = c ? t : __clc_flush_if_daz(x); - q7 = c ? qsgn : 0; - - __CLC_GENTYPE zero = __clc_copysign(__CLC_FP_LIT(0.0), x); - ret = ax == ay ? zero : ret; - q7 = ax == ay ? qsgn : q7; - } - - ret = y == __CLC_FP_LIT(0.0) ? __CLC_GENTYPE_NAN : ret; - q7 = y == __CLC_FP_LIT(0.0) ? 0 : q7; - - __CLC_S_GENTYPE finite = !__clc_isnan(y) && __clc_isfinite(x); - - // A defined 0 result for quo with a nan result is an additional OpenCL - // requirement beyond standard C. - __CLC_REMQUO_RET_GENTYPE result; - result.quo = finite ? q7 : 0; - result.rem = finite ? ret : __CLC_GENTYPE_NAN; - - return result; -} - -#undef __CLC_REMQUO_EVAL_TYPE -#undef __CLC_CONVERT_REMQUO_EVAL_TYPE -#undef __CLC_S_EVAL_TYPE -#undef __CLC_CONVERT_S_EVAL_TYPE - -#endif // __CLC_SCALAR >From 64fdb5b8c664cd2b1a2ecc11224627f518297723 Mon Sep 17 00:00:00 2001 From: Karol Herbst <[email protected]> Date: Wed, 1 Apr 2026 02:55:52 +0200 Subject: [PATCH 5/5] Partly revert "libclc: Update trigpi functions (#187579)" The initial commit broke sinpi This partly reverts commit 421bf13e4bf1c19a0cc48fbb0af63b902d3118b1 --- .../clc/math/clc_sincos_helpers_fp64_decl.inc | 5 + libclc/clc/lib/generic/math/clc_sinpi.cl | 8 +- libclc/clc/lib/generic/math/clc_sinpi.inc | 106 +++++++++++++++++- 3 files changed, 115 insertions(+), 4 deletions(-) diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc index 22081d94dee51..d638d72ab8494 100644 --- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc +++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc @@ -22,6 +22,11 @@ _CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan_reduced_eval(__CLC_DOUBLEN x, __CLC_DOUBLEN y, __CLC_INTN is_odd); +_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x, + __CLC_DOUBLEN xx, + private __CLC_DOUBLEN *sinval, + private __CLC_DOUBLEN *cosval); + _CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small( __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr); diff --git a/libclc/clc/lib/generic/math/clc_sinpi.cl b/libclc/clc/lib/generic/math/clc_sinpi.cl index bf57dc45ccd1e..f9bed7ec78306 100644 --- a/libclc/clc/lib/generic/math/clc_sinpi.cl +++ b/libclc/clc/lib/generic/math/clc_sinpi.cl @@ -6,8 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "clc/math/clc_sincospi.h" -#include "clc/math/clc_sinpi.h" +#include "clc/clc_convert.h" +#include "clc/float/definitions.h" +#include "clc/internal/clc.h" +#include "clc/math/clc_fabs.h" +#include "clc/math/clc_sincos_helpers.h" +#include "clc/math/math.h" #define __CLC_BODY "clc_sinpi.inc" #include "clc/math/gentype.inc" diff --git a/libclc/clc/lib/generic/math/clc_sinpi.inc b/libclc/clc/lib/generic/math/clc_sinpi.inc index a6e5ee73b1650..264609aeaca45 100644 --- a/libclc/clc/lib/generic/math/clc_sinpi.inc +++ b/libclc/clc/lib/generic/math/clc_sinpi.inc @@ -6,7 +6,109 @@ // //===----------------------------------------------------------------------===// +#if __CLC_FPSIZE == 32 + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) { - __CLC_GENTYPE unused_cos; - return __clc_sincospi(x, &unused_cos); + __CLC_INTN ix = __CLC_AS_INTN(x); + __CLC_INTN xsgn = ix & (__CLC_INTN)0x80000000; + ix ^= xsgn; + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_INTN iax = __CLC_CONVERT_INTN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_INTN xodd = + xsgn ^ ((iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0); + + // Initialize with return for +-Inf and NaN + __CLC_INTN ir = QNANBITPATT_SP32; + + // 2^23 <= |x| < Inf, the result is always integer + ir = ix < PINFBITPATT_SP32 ? xsgn : ir; + + // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0f - r; + __CLC_INTN e = 0; + + // r <= 0.75 + __CLC_INTN c = r <= 0.75f; + a = c ? r - 0.5f : a; + e = c ? 1 : e; + + // r < 0.5 + c = r < 0.5f; + a = c ? 0.5f - r : a; + + // 0 < r <= 0.25 + c = r <= 0.25f; + a = c ? r : a; + e = c ? 0 : e; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval); + __CLC_INTN jr = xodd ^ __CLC_AS_INTN(e != 0 ? cosval : sinval); + + ir = ix < 0x4b000000 ? jr : ir; + + return __CLC_AS_GENTYPE(ir); } + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) { + __CLC_LONGN ix = __CLC_AS_LONGN(x); + __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L; + ix ^= xsgn; + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_LONGN xodd = + xsgn ^ + ((iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L); + + // Initialize with return for +-Inf and NaN + __CLC_LONGN ir = QNANBITPATT_DP64; + + // 2^23 <= |x| < Inf, the result is always integer + ir = ix < PINFBITPATT_DP64 ? xsgn : ir; + + // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0 - r; + __CLC_LONGN e = 0; + + // r <= 0.75 + __CLC_LONGN c = r <= 0.75; + __CLC_GENTYPE t = r - 0.5; + a = c ? t : a; + e = c ? 1 : e; + + // r < 0.5 + c = r < 0.5; + t = 0.5 - r; + a = c ? t : a; + + // r <= 0.25 + c = r <= 0.25; + a = c ? r : a; + e = c ? 0 : e; + + __CLC_GENTYPE api = a * M_PI; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(api, 0.0, &sinval, &cosval); + __CLC_LONGN jr = xodd ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval); + + ir = absx < 0x1.0p+52 ? jr : ir; + + return __CLC_AS_GENTYPE(ir); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_sinpi(__CLC_CONVERT_FLOATN(x))); +} + +#endif _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
