https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/188186
This was originally ported from rocm device libs in bf9f76fbe0fb8b8af3377d2c6ce7d7cc290894ad, merge in more recent changes. >From d710fbaebe628cb1c9a4ac0d14f88d742c03527c Mon Sep 17 00:00:00 2001 From: Matt Arsenault <[email protected]> Date: Thu, 19 Mar 2026 14:07:04 +0100 Subject: [PATCH] libclc: Update log1p This was originally ported from rocm device libs in bf9f76fbe0fb8b8af3377d2c6ce7d7cc290894ad, merge in more recent changes. --- libclc/clc/include/clc/math/clc_ep_decl.inc | 4 +- libclc/clc/lib/generic/math/clc_ep.inc | 60 +++++++- libclc/clc/lib/generic/math/clc_log1p.cl | 4 +- libclc/clc/lib/generic/math/clc_log1p.inc | 154 +++----------------- 4 files changed, 82 insertions(+), 140 deletions(-) diff --git a/libclc/clc/include/clc/math/clc_ep_decl.inc b/libclc/clc/include/clc/math/clc_ep_decl.inc index 9bb06cba91a69..5f83d4f6af1dd 100644 --- a/libclc/clc/include/clc/math/clc_ep_decl.inc +++ b/libclc/clc/include/clc/math/clc_ep_decl.inc @@ -82,7 +82,7 @@ _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_fast_sub(__CLC_EP_PAIR a, __CLC_EP_PAIR b); _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ldexp(__CLC_EP_PAIR a, - int e); + __CLC_INTN e); _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_mul(__CLC_EP_PAIR a, __CLC_GENTYPE b); @@ -135,4 +135,6 @@ _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a); #if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR a); _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a); +_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_ln_hi(__CLC_EP_PAIR a, + __CLC_INTN ea); #endif diff --git a/libclc/clc/lib/generic/math/clc_ep.inc b/libclc/clc/lib/generic/math/clc_ep.inc index fc45c892b6c80..8f844644c43fd 100644 --- a/libclc/clc/lib/generic/math/clc_ep.inc +++ b/libclc/clc/lib/generic/math/clc_ep.inc @@ -245,7 +245,7 @@ __clc_ep_fast_sub(__CLC_EP_PAIR a, __CLC_EP_PAIR b) { } _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ldexp(__CLC_EP_PAIR a, - int e) { + __CLC_INTN e) { return __clc_ep_make_pair(__clc_ldexp(a.hi, e), __clc_ldexp(a.lo, e)); } @@ -469,6 +469,28 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a) { return r; } +_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_FLOATN __clc_ep_ln_hi(__CLC_EP_PAIR a, + __CLC_INTN ea) { + __CLC_INTN a_hi_exp; + __CLC_FLOATN a_hi_m = __clc_frexp(a.hi, &a_hi_exp); + __CLC_INTN b = a_hi_m < (2.0f / 3.0f); + __CLC_INTN e = a_hi_exp - b; + __CLC_EP_PAIR m = __clc_ep_ldexp(a, -e); + __CLC_EP_PAIR x = + __clc_ep_div(__clc_ep_fast_add(-1.0f, m), __clc_ep_fast_add(1.0f, m)); + __CLC_FLOATN s = x.hi * x.hi; + __CLC_FLOATN p = __clc_mad(s, __clc_mad(s, 0x1.36db58p-2f, 0x1.992b46p-2f), + 0x1.5555b4p-1f); + + const __CLC_EP_PAIR ln2 = __clc_ep_make_pair(__CLC_FP_LIT(0x1.62e430p-1), + __CLC_FP_LIT(-0x1.05c610p-29)); + + __CLC_EP_PAIR r = + __clc_ep_add(__clc_ep_mul(ln2, __CLC_CONVERT_FLOATN(e + ea)), + __clc_ep_fast_add(__clc_ep_ldexp(x, 1), s * x.hi * p)); + return r.hi; +} + #elif __CLC_FPSIZE == 64 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) { @@ -509,6 +531,42 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_DOUBLEN a) { return r; } +_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_DOUBLEN __clc_ep_ln_hi(__CLC_EP_PAIR a, + __CLC_INTN ea) { + __CLC_INTN a_hi_exp; + __CLC_DOUBLEN m_hi = __clc_frexp(a.hi, &a_hi_exp); + + __CLC_LONGN b = m_hi < (2.0 / 3.0); + __CLC_INTN e = a_hi_exp - __CLC_CONVERT_INTN(b); + __CLC_EP_PAIR m = __clc_ep_ldexp(a, -e); + __CLC_EP_PAIR x = + __clc_ep_div(__clc_ep_fast_add(-1.0, m), __clc_ep_fast_add(1.0, m)); + __CLC_DOUBLEN s = x.hi * x.hi; + + const __CLC_DOUBLEN ln_c1 = 0x1.3ab76bf559e2bp-3; + const __CLC_DOUBLEN ln_c2 = 0x1.385386b47b09ap-3; + const __CLC_DOUBLEN ln_c3 = 0x1.7474dd7f4df2ep-3; + const __CLC_DOUBLEN ln_c4 = 0x1.c71c016291751p-3; + const __CLC_DOUBLEN ln_c5 = 0x1.249249b27acf1p-2; + const __CLC_DOUBLEN ln_c6 = 0x1.99999998ef7b6p-2; + const __CLC_DOUBLEN ln_c7 = 0x1.5555555555780p-1; + + __CLC_DOUBLEN q1 = __clc_mad(s, ln_c1, ln_c2); + __CLC_DOUBLEN q2 = __clc_mad(s, q1, ln_c3); + __CLC_DOUBLEN q3 = __clc_mad(s, q2, ln_c4); + __CLC_DOUBLEN q4 = __clc_mad(s, q3, ln_c5); + __CLC_DOUBLEN q5 = __clc_mad(s, q4, ln_c6); + __CLC_DOUBLEN p = __clc_mad(s, q5, ln_c7); + + const __CLC_EP_PAIR ln2 = __clc_ep_make_pair( + __CLC_FP_LIT(0x1.62e42fefa39efp-1), __CLC_FP_LIT(0x1.abc9e3b39803fp-56)); + + __CLC_EP_PAIR r = + __clc_ep_add(__clc_ep_mul(ln2, __CLC_CONVERT_DOUBLEN(e + ea)), + __clc_ep_fast_add(__clc_ep_ldexp(x, 1), s * x.hi * p)); + return r.hi; +} + #endif #undef __CLC_EP_USE_FMA diff --git a/libclc/clc/lib/generic/math/clc_log1p.cl b/libclc/clc/lib/generic/math/clc_log1p.cl index 6c32c1ffc361d..557bd0d1441dc 100644 --- a/libclc/clc/lib/generic/math/clc_log1p.cl +++ b/libclc/clc/lib/generic/math/clc_log1p.cl @@ -10,9 +10,11 @@ #include "clc/float/definitions.h" #include "clc/internal/clc.h" #include "clc/math/clc_fma.h" +#include "clc/math/clc_ep.h" #include "clc/math/clc_mad.h" +#include "clc/math/clc_fabs.h" +#include "clc/math/clc_log2_fast.h" #include "clc/math/math.h" -#include "clc/math/tables.h" #include "clc/relational/clc_isinf.h" #define __CLC_BODY "clc_log1p.inc" diff --git a/libclc/clc/lib/generic/math/clc_log1p.inc b/libclc/clc/lib/generic/math/clc_log1p.inc index 2dd616818c715..c86b5afa6cb75 100644 --- a/libclc/clc/lib/generic/math/clc_log1p.inc +++ b/libclc/clc/lib/generic/math/clc_log1p.inc @@ -19,152 +19,32 @@ #if __CLC_FPSIZE == 32 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_log1p(__CLC_GENTYPE x) { - __CLC_GENTYPE w = x; - __CLC_UINTN ux = __CLC_AS_UINTN(x); - __CLC_UINTN ax = ux & EXSIGNBIT_SP32; - - // |x| < 2^-4 - __CLC_GENTYPE u2 = MATH_DIVIDE(x, 2.0f + x); - __CLC_GENTYPE u = u2 + u2; - __CLC_GENTYPE v = u * u; - // 2/(5 * 2^5), 2/(3 * 2^3) - __CLC_GENTYPE zsmall = - __clc_mad(-u2, x, __clc_mad(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v * u) + - x; - - // |x| >= 2^-4 - ux = __CLC_AS_UINTN(x + 1.0f); - - __CLC_INTN m = __CLC_AS_INTN((ux >> EXPSHIFTBITS_SP32) & 0xff) - EXPBIAS_SP32; - __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m); - __CLC_UINTN indx = (ux & 0x007f0000) + ((ux & 0x00008000) << 1); - __CLC_GENTYPE F = __CLC_AS_GENTYPE(indx | 0x3f000000); - - // x > 2^24 - __CLC_GENTYPE fg24 = F - __CLC_AS_GENTYPE(0x3f000000 | (ux & MANTBITS_SP32)); - - // x <= 2^24 - __CLC_UINTN xhi = ux & 0xffff8000; - __CLC_GENTYPE xh = __CLC_AS_GENTYPE(xhi); - __CLC_GENTYPE xt = (1.0f - xh) + w; - __CLC_UINTN xnm = ((~(xhi & 0x7f800000)) - 0x00800000) & 0x7f800000; - xt = xt * __CLC_AS_GENTYPE(xnm) * 0.5f; - __CLC_GENTYPE fl24 = - F - __CLC_AS_GENTYPE(0x3f000000 | (xhi & MANTBITS_SP32)) - xt; - - __CLC_GENTYPE f = mf > 24.0f ? fg24 : fl24; - - indx = indx >> 16; - __CLC_GENTYPE r = f * __CLC_USE_TABLE(log_inv_tbl, __CLC_CONVERT_INTN(indx)); - - // 1/3, 1/2 - __CLC_GENTYPE poly = - __clc_mad(__clc_mad(r, 0x1.555556p-2f, 0x1.0p-1f), r * r, r); - - const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 - const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - - __CLC_GENTYPE tv0 = __CLC_USE_TABLE(loge_tbl_lo, __CLC_AS_INTN(indx)); - __CLC_GENTYPE tv1 = __CLC_USE_TABLE(loge_tbl_hi, __CLC_AS_INTN(indx)); - __CLC_GENTYPE z1 = __clc_mad(mf, LOG2_HEAD, tv0); - __CLC_GENTYPE z2 = __clc_mad(mf, LOG2_TAIL, -poly) + tv1; - __CLC_GENTYPE z = z1 + z2; - - z = ax < 0x3d800000U ? zsmall : z; - - // Edge cases - z = ax >= PINFBITPATT_SP32 ? w : z; - z = w < -1.0f ? __CLC_GENTYPE_NAN : z; - z = w == -1.0f ? __CLC_AS_GENTYPE((__CLC_UINTN)NINFBITPATT_SP32) : z; - // Fix subnormals - z = ax < 0x33800000 ? x : z; - - return z; +_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_FLOATN __clc_log1p(__CLC_FLOATN x) { + __CLC_FLOATN z = __clc_ep_ln_hi(__clc_ep_add(__CLC_FP_LIT(1.0), x), 0); + z = x == __CLC_GENTYPE_INF ? x : z; + z = x < -1.0f ? __CLC_GENTYPE_NAN : z; + z = x == -1.0f ? -__CLC_GENTYPE_INF : z; + return __clc_fabs(x) < 0x1.0p-24f ? x : z; } #elif __CLC_FPSIZE == 64 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_log1p(__CLC_GENTYPE x) { - // Process Inside the threshold now - __CLC_ULONGN ux = __CLC_AS_ULONGN((__CLC_GENTYPE)1.0 + x); - __CLC_INTN xexp = - __CLC_CONVERT_INTN((ux >> EXPSHIFTBITS_DP64) & 0x7ff) - EXPBIAS_DP64; - __CLC_GENTYPE f = - __CLC_AS_GENTYPE((__CLC_ULONGN)ONEEXPBITS_DP64 | (ux & MANTBITS_DP64)); +_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_DOUBLEN __clc_log1p(__CLC_DOUBLEN x) { + __CLC_GENTYPE z = __clc_ep_ln_hi(__clc_ep_add(1.0, x), 0); - __CLC_INTN j = __CLC_CONVERT_INTN(ux >> 45); - j = ((0x80 | (j & 0x7e)) >> 1) + (j & 0x1); - __CLC_GENTYPE f1 = __CLC_CONVERT_GENTYPE(j) * 0x1.0p-6; - j -= 64; - - __CLC_GENTYPE f2temp = f - f1; - __CLC_GENTYPE m2 = - __CLC_AS_GENTYPE(__CLC_CONVERT_ULONGN(0x3ff - xexp) << EXPSHIFTBITS_DP64); - __CLC_GENTYPE f2l = __clc_fma(m2, x, m2 - f1); - __CLC_GENTYPE f2g = __clc_fma(m2, x, -f1) + m2; - __CLC_GENTYPE f2 = - __CLC_CONVERT_LONGN(xexp <= MANTLENGTH_DP64 - 1) ? f2l : f2g; - f2 = __CLC_CONVERT_LONGN(xexp <= -2 || (xexp >= MANTLENGTH_DP64 + 8)) ? f2temp - : f2; - - __CLC_GENTYPE z1 = __CLC_USE_TABLE(ln_tbl_lo, j); - __CLC_GENTYPE q = __CLC_USE_TABLE(ln_tbl_hi, j); - - __CLC_GENTYPE u = MATH_DIVIDE(f2, __clc_fma(0.5, f2, f1)); - __CLC_GENTYPE v = u * u; - - __CLC_GENTYPE poly = v * __clc_fma(v, - __clc_fma(v, 2.23219810758559851206e-03, - 1.24999999978138668903e-02), - 8.33333333333333593622e-02); - - // log2_lead and log2_tail sum to an extra-precise version of log(2) - // 0x3fe62e42e0000000 - const __CLC_GENTYPE log2_lead = 6.93147122859954833984e-01; - // 0x3e6efa39ef35793c - const __CLC_GENTYPE log2_tail = 5.76999904754328540596e-08; - - __CLC_GENTYPE z2 = q + __clc_fma(u, poly, u); - __CLC_GENTYPE dxexp = __CLC_CONVERT_GENTYPE(xexp); - __CLC_GENTYPE r1 = __clc_fma(dxexp, log2_lead, z1); - __CLC_GENTYPE r2 = __clc_fma(dxexp, log2_tail, z2); - __CLC_GENTYPE result1 = r1 + r2; - - // Process Outside the threshold now - __CLC_GENTYPE r = x; - u = r / (2.0 + r); - __CLC_GENTYPE correction = r * u; - u = u + u; - v = u * u; - r1 = r; - - poly = __clc_fma(v, - __clc_fma(v, - __clc_fma(v, 4.34887777707614552256e-04, - 2.23213998791944806202e-03), - 1.25000000037717509602e-02), - 8.33333333333317923934e-02); - - r2 = __clc_fma(u * v, poly, -correction); - - // The values exp(-1/16)-1 and exp(1/16)-1 - const __CLC_GENTYPE log1p_thresh1 = -0x1.f0540438fd5c3p-5; - const __CLC_GENTYPE log1p_thresh2 = 0x1.082b577d34ed8p-4; - __CLC_GENTYPE result2 = r1 + r2; - result2 = x < log1p_thresh1 || x > log1p_thresh2 ? result1 : result2; - - result2 = __clc_isinf(x) ? x : result2; - result2 = x < -1.0 ? __CLC_GENTYPE_NAN : result2; - result2 = - x == -1.0 ? __CLC_AS_GENTYPE((__CLC_ULONGN)NINFBITPATT_DP64) : result2; - return result2; + z = x == __CLC_GENTYPE_INF ? x : z; + z = x < -1.0 ? __CLC_GENTYPE_NAN : z; + z = x == -1.0 ? -__CLC_GENTYPE_INF : z; + return z; } + #elif __CLC_FPSIZE == 16 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_log1p(__CLC_GENTYPE x) { - return __CLC_CONVERT_GENTYPE(__clc_log1p(__CLC_CONVERT_FLOATN(x))); +_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_HALFN __clc_log1p(__CLC_HALFN x) { + __CLC_HALFN ret = __CLC_CONVERT_HALFN(__clc_log2_fast(__CLC_CONVERT_FLOATN(x) + 1.0f) * 0x1.62e430p-1f); + __CLC_HALFN p = __clc_mad(x, x * __clc_mad(x, 0x1.555556p-2h, -0.5h), x); + return __clc_fabs(x) < 0x1.0p-6h ? p : ret; } #endif _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
