llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

Most of this was originally ported from the ROCm
device libs in 2e6ff0c66e180998425776a27579559dc099732f. This patch merges
in more recent changes.

---

Patch is 37.96 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/187455.diff


15 Files Affected:

- (added) libclc/clc/include/clc/math/clc_get_twobypi_bits.h (+25) 
- (added) libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc (+3) 
- (modified) libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc 
(+21-8) 
- (modified) libclc/clc/include/clc/math/tables.h (+1-1) 
- (modified) libclc/clc/lib/amdgpu/CMakeLists.txt (+1) 
- (added) libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl (+21) 
- (modified) libclc/clc/lib/generic/CMakeLists.txt (+1) 
- (modified) libclc/clc/lib/generic/math/clc_cos.inc (+9-20) 
- (added) libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl (+17) 
- (added) libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc (+40) 
- (modified) libclc/clc/lib/generic/math/clc_sin.inc (+12-20) 
- (modified) libclc/clc/lib/generic/math/clc_sincos_helpers.cl (+5-6) 
- (modified) libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc (+163-210) 
- (modified) libclc/clc/lib/generic/math/clc_tables.cl (+50-53) 
- (modified) libclc/clc/lib/generic/math/clc_tan.inc (+10-22) 


``````````diff
diff --git a/libclc/clc/include/clc/math/clc_get_twobypi_bits.h 
b/libclc/clc/include/clc/math/clc_get_twobypi_bits.h
new file mode 100644
index 0000000000000..ace433c8c05ca
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_get_twobypi_bits.h
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility function for trigonometric reductions to extract bits out of 2/pi
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_GET_TWOBYPI_BITS_H__
+#define __CLC_MATH_CLC_GET_TWOBYPI_BITS_H__
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_BODY <clc/math/clc_get_twobypi_bits_decl.inc>
+#define __CLC_FUNCTION __clc_get_twobypi_bits
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+#undef __CLC_DOUBLE_ONLY
+
+#endif // __CLC_MATH_CLC_GET_TWOBYPI_BITS_H__
diff --git a/libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc 
b/libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc
new file mode 100644
index 0000000000000..6ed2379ecdf94
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc
@@ -0,0 +1,3 @@
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_get_twobypi_bits(__CLC_GENTYPE x, __CLC_INTN segment);
diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc 
b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
index 15934cab32751..bbae56c69c937 100644
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
@@ -6,6 +6,19 @@
 //
 
//===----------------------------------------------------------------------===//
 
+typedef struct __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE) {
+  __CLC_GENTYPE sin, cos;
+} __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE);
+
+#define __CLC_SINCOS_RET_GENTYPE __CLC_XCONCAT(__clc_sincos_ret_, 
__CLC_GENTYPE)
+
+_CLC_DEF _CLC_OVERLOAD __CLC_SINCOS_RET_GENTYPE
+__clc_sincos_reduced_eval(__CLC_DOUBLEN x, __CLC_DOUBLEN y);
+
+_CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan_reduced_eval(__CLC_DOUBLEN x,
+                                                            __CLC_DOUBLEN y,
+                                                            __CLC_INTN is_odd);
+
 _CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
                                                 __CLC_DOUBLEN xx,
                                                 private __CLC_DOUBLEN *sinval,
@@ -15,12 +28,12 @@ _CLC_DECL _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN 
x, __CLC_DOUBLEN xx,
                                              private __CLC_DOUBLEN *leadval,
                                              private __CLC_DOUBLEN *tailval);
 
-_CLC_DECL _CLC_OVERLOAD void
-__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
-                             private __CLC_DOUBLEN *rr,
-                             private __CLC_INTN *regn);
+_CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small(
+    __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_large(
+    __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr);
 
-_CLC_DECL _CLC_OVERLOAD void
-__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
-                            private __CLC_DOUBLEN *rr,
-                            private __CLC_INTN *regn);
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_DOUBLEN *r,
+                                                      private __CLC_DOUBLEN 
*rr,
+                                                      __CLC_DOUBLEN x);
diff --git a/libclc/clc/include/clc/math/tables.h 
b/libclc/clc/include/clc/math/tables.h
index 12361a30357ac..66429aaf8344b 100644
--- a/libclc/clc/include/clc/math/tables.h
+++ b/libclc/clc/include/clc/math/tables.h
@@ -74,7 +74,7 @@ __CLC_TABLE_FUNCTION_DECL_VEC(float, cbrt_tbl_head);
 __CLC_TABLE_FUNCTION_DECL_VEC(float, cbrt_tbl_tail);
 __CLC_TABLE_FUNCTION_DECL_VEC(float, sinhcosh_tbl_head);
 __CLC_TABLE_FUNCTION_DECL_VEC(float, sinhcosh_tbl_tail);
-__CLC_TABLE_FUNCTION_DECL_VEC(ulong, pibits_tbl);
+__CLC_TABLE_FUNCTION_DECL_VEC(uint, two_by_pi_bits_tbl);
 
 #ifdef cl_khr_fp64
 
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt 
b/libclc/clc/lib/amdgpu/CMakeLists.txt
index daccc00b841b3..a2a30c2941d6b 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -15,6 +15,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
   math/clc_half_recip.cl
   math/clc_half_rsqrt.cl
   math/clc_half_sqrt.cl
+  math/clc_get_twobypi_bits.cl
   math/clc_ldexp.cl
   math/clc_log2_fast.cl
   math/clc_native_exp.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl 
b/libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl
new file mode 100644
index 0000000000000..5de3b357f58d4
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_get_twobypi_bits.h"
+
+_CLC_OVERLOAD _CLC_DEF double __clc_get_twobypi_bits(double x, int y) {
+  return __builtin_amdgcn_trig_preop(x, y);
+}
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_FUNCTION __clc_get_twobypi_bits
+#define __CLC_IMPL_FUNCTION(x, y) __builtin_amdgcn_trig_preop(x, y)
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/CMakeLists.txt 
b/libclc/clc/lib/generic/CMakeLists.txt
index f9eb15a0aafda..af6a556d33d1a 100644
--- a/libclc/clc/lib/generic/CMakeLists.txt
+++ b/libclc/clc/lib/generic/CMakeLists.txt
@@ -94,6 +94,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
   math/clc_fract.cl
   math/clc_frexp.cl
   math/clc_frexp_exp.cl
+  math/clc_get_twobypi_bits.cl
   math/clc_half_cos.cl
   math/clc_half_divide.cl
   math/clc_half_exp.cl
diff --git a/libclc/clc/lib/generic/math/clc_cos.inc 
b/libclc/clc/lib/generic/math/clc_cos.inc
index bd5a8679505e2..8f6d2391e50c0 100644
--- a/libclc/clc/lib/generic/math/clc_cos.inc
+++ b/libclc/clc/lib/generic/math/clc_cos.inc
@@ -30,30 +30,19 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE 
__clc_cos(__CLC_GENTYPE x) {
 #elif __CLC_FPSIZE == 64
 
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
-  x = __clc_select(x, __CLC_GENTYPE_NAN,
-                   __CLC_CONVERT_S_GENTYPE(__clc_isinf(x)));
+  x = __clc_select(x, __CLC_GENTYPE_NAN, __CLC_CONVERT_LONGN(__clc_isinf(x)));
 
-  __CLC_GENTYPE absx = __clc_fabs(x);
+  __CLC_DOUBLEN absx = __clc_fabs(x);
 
-  __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+  __CLC_DOUBLEN reduced_lo, reduced_hi;
+  __CLC_INTN regn = __clc_argReductionS(&reduced_lo, &reduced_hi, absx);
 
-  __CLC_INTN regn_m, regn_l;
-  __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+  __CLC_SINCOS_RET_GENTYPE eval =
+      __clc_sincos_reduced_eval(reduced_hi, reduced_lo);
 
-  __clc_remainder_piby2_medium(absx, &r_m, &rr_m, &regn_m);
-  __clc_remainder_piby2_large(absx, &r_l, &rr_l, &regn_l);
-
-  __CLC_GENTYPE r = is_medium ? r_m : r_l;
-  __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
-  __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
-
-  __CLC_GENTYPE sinval, cosval;
-  __clc_sincos_piby4(r, rr, &sinval, &cosval);
-  sinval = -sinval;
-
-  __CLC_LONGN c =
-      __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? sinval : 
cosval);
-  c ^= __CLC_CONVERT_BIT_INTN(regn > 1) << 63;
+  __CLC_ULONGN c = __CLC_AS_ULONGN(
+      __CLC_CONVERT_LONGN((regn & 1) != 0) ? -eval.sin : eval.cos);
+  c ^= __CLC_CONVERT_LONGN(regn > 1) ? SIGNBIT_DP64 : 0u;
 
   return __CLC_AS_GENTYPE(c);
 }
diff --git a/libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl 
b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl
new file mode 100644
index 0000000000000..5dfca5dac4d1c
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl
@@ -0,0 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/clc_convert.h"
+#include "clc/integer/clc_clz.h"
+#include "clc/math/clc_get_twobypi_bits.h"
+#include "clc/math/tables.h"
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_FUNCTION __clc_get_twobypi_bits
+#define __CLC_BODY <clc_get_twobypi_bits.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc 
b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc
new file mode 100644
index 0000000000000..1c066ad11a58a
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_get_twobypi_bits(__CLC_DOUBLEN x,
+                                                            __CLC_INTN index) {
+  const __CLC_INTN e_clamp = 1077;
+  __CLC_INTN e = __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> 52);
+  __CLC_INTN shift = e > e_clamp ? e - e_clamp : 0;
+  __CLC_INTN scale = e >= 0x7b0 ? 128 : 0;
+
+  __CLC_INTN start = shift + index * 53;
+
+  __CLC_INTN i = start >> 5;
+  __CLC_INTN b = start & 0x1f;
+
+  __CLC_UINTN w2 = __CLC_USE_TABLE(two_by_pi_bits_tbl, i);
+  __CLC_UINTN w1 = __CLC_USE_TABLE(two_by_pi_bits_tbl, i + 1);
+  __CLC_UINTN w0 = __CLC_USE_TABLE(two_by_pi_bits_tbl, i + 2);
+
+  __CLC_UINTN t = (w2 << b) | (w1 >> (32 - b));
+  w2 = b != 0 ? t : w2;
+
+  t = (w1 << b) | (w0 >> (32 - b));
+  w1 = b != 0 ? t : w1;
+  w1 &= 0xfffff800;
+
+  __CLC_INTN z = __CLC_CONVERT_INTN(__clc_clz(w2));
+  b = 11 - z;
+  w1 = (w1 >> b) | (w2 << (32 - b));
+  w2 >>= b;
+
+  return __CLC_AS_DOUBLEN(
+      (__CLC_CONVERT_ULONGN(1022 + scale - start - z) << 52) |
+      (__CLC_CONVERT_ULONGN(w2 & 0x000fffff) << 32) | 
__CLC_CONVERT_ULONGN(w1));
+}
diff --git a/libclc/clc/lib/generic/math/clc_sin.inc 
b/libclc/clc/lib/generic/math/clc_sin.inc
index be23f125a060d..3e839fdf43f17 100644
--- a/libclc/clc/lib/generic/math/clc_sin.inc
+++ b/libclc/clc/lib/generic/math/clc_sin.inc
@@ -32,31 +32,23 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE 
__clc_sin(__CLC_GENTYPE x) {
 #elif __CLC_FPSIZE == 64
 
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
-  x = __clc_select(x, __CLC_GENTYPE_NAN,
-                   __CLC_CONVERT_S_GENTYPE(__clc_isinf(x)));
+  x = __clc_select(x, __CLC_GENTYPE_NAN, __CLC_CONVERT_LONGN(__clc_isinf(x)));
 
-  __CLC_GENTYPE absx = __clc_fabs(x);
+  __CLC_DOUBLEN absx = __clc_fabs(x);
 
-  __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+  __CLC_DOUBLEN reduced_lo, reduced_hi;
+  __CLC_INTN regn = __clc_argReductionS(&reduced_lo, &reduced_hi, absx);
 
-  __CLC_INTN regn_m, regn_l;
-  __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+  __CLC_SINCOS_RET_GENTYPE eval =
+      __clc_sincos_reduced_eval(reduced_hi, reduced_lo);
 
-  __clc_remainder_piby2_medium(absx, &r_m, &rr_m, &regn_m);
-  __clc_remainder_piby2_large(absx, &r_l, &rr_l, &regn_l);
+  __CLC_DOUBLEN s = __CLC_CONVERT_LONGN((regn & 1) == 0) ? eval.sin : eval.cos;
 
-  __CLC_GENTYPE r = is_medium ? r_m : r_l;
-  __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
-  __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
-
-  __CLC_GENTYPE sinval, cosval;
-  __clc_sincos_piby4(r, rr, &sinval, &cosval);
-
-  __CLC_LONGN s =
-      __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? cosval : 
sinval);
-
-  s ^= (__CLC_CONVERT_BIT_INTN(regn > 1) << 63) ^
-       (__CLC_CONVERT_BIT_INTN(x < 0.0) << 63);
+  s = __CLC_AS_DOUBLEN(__CLC_AS_ULONGN(s) ^
+                       (__CLC_CONVERT_LONGN(regn > 1)
+                            ? (__CLC_ULONGN)SIGNBIT_DP64
+                            : (__CLC_ULONGN)0) ^
+                       (__CLC_AS_ULONGN(x) ^ __CLC_AS_ULONGN(absx)));
 
   return __CLC_AS_GENTYPE(s);
 }
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl 
b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
index 60880c7fae298..8c899c08d57a3 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
@@ -9,6 +9,7 @@
 #include "clc/clc_convert.h"
 #include "clc/integer/clc_clz.h"
 #include "clc/internal/clc.h"
+#include "clc/math/clc_floor.h"
 #include "clc/math/clc_fma.h"
 #include "clc/math/clc_frexp.h"
 #include "clc/math/clc_ldexp.h"
@@ -18,6 +19,8 @@
 #include "clc/math/clc_sincos_helpers.h"
 #include "clc/math/clc_trunc.h"
 #include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isnan.h"
 
 #define bitalign(hi, lo, shift) __builtin_elementwise_fshr(hi, lo, shift)
 
@@ -30,14 +33,10 @@
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
+#include "clc/math/clc_ep.h"
 #include "clc/math/clc_fract.h"
+#include "clc/math/clc_get_twobypi_bits.h"
 #include "clc/math/tables.h"
-#include "clc/shared/clc_max.h"
-
-#define bytealign(src0, src1, src2)                                            
\
-  (__CLC_CONVERT_UINTN(                                                        
\
-      ((__CLC_CONVERT_LONGN((src0)) << 32) | __CLC_CONVERT_LONGN((src1))) >>   
\
-      (((src2) & 3) * 8)))
 
 #define __CLC_DOUBLE_ONLY
 #define __CLC_BODY "clc_sincos_helpers_fp64.inc"
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc 
b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
index ae97b7963f7b3..cdb947c30d49c 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@@ -8,6 +8,59 @@
 
 #pragma OPENCL FP_CONTRACT OFF
 
+_CLC_DEF _CLC_OVERLOAD __CLC_SINCOS_RET_GENTYPE
+__clc_sincos_reduced_eval(__CLC_DOUBLEN x, __CLC_DOUBLEN y) {
+  const __CLC_DOUBLEN S0 = -0x1.5555555555555p-3;
+  const __CLC_DOUBLEN S1 = 0x1.1111111110bb3p-7;
+  const __CLC_DOUBLEN S2 = -0x1.a01a019e83e5cp-13;
+  const __CLC_DOUBLEN S3 = 0x1.71de3796cde01p-19;
+  const __CLC_DOUBLEN S4 = -0x1.ae600b42fdfa7p-26;
+  const __CLC_DOUBLEN S5 = 0x1.5e0b2f9a43bb8p-33;
+
+  const __CLC_DOUBLEN C0 = 0x1.5555555555555p-5;
+  const __CLC_DOUBLEN C1 = -0x1.6c16c16c16967p-10;
+  const __CLC_DOUBLEN C2 = 0x1.a01a019f4ec90p-16;
+  const __CLC_DOUBLEN C3 = -0x1.27e4fa17f65f6p-22;
+  const __CLC_DOUBLEN C4 = 0x1.1eeb69037ab78p-29;
+  const __CLC_DOUBLEN C5 = -0x1.907db46cc5e42p-37;
+
+  __CLC_DOUBLEN x2 = x * x;
+  __CLC_DOUBLEN x3 = x * x2;
+  __CLC_DOUBLEN r = 0.5 * x2;
+  __CLC_DOUBLEN t = 1.0 - r;
+  __CLC_DOUBLEN u = 1.0 - t;
+  __CLC_DOUBLEN v = u - r;
+
+  __CLC_DOUBLEN cxy = t + __clc_mad(x2 * x2, __clc_mad(x2, __clc_mad(x2, 
__clc_mad(x2, __clc_mad(x2, __clc_mad(x2, C5, C4), C3), C2), C1), C0), 
__clc_mad(x, -y, v));
+  __CLC_DOUBLEN sxy = __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, 
S5, S4), S3), S2), S1);
+  sxy = x - __clc_mad(-x3, S0, __clc_mad(x2, __clc_mad(-x3, sxy, 
__CLC_FP_LIT(0.5) * y), -y));
+
+  __CLC_SINCOS_RET_GENTYPE ret;
+  ret.cos = cxy;
+  ret.sin = sxy;
+  return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan_reduced_eval(__CLC_DOUBLEN x,
+                                                            __CLC_DOUBLEN xx,
+                                                            __CLC_INTN is_odd) 
{
+  __CLC_DOUBLEN s = __clc_ep_sqr(__clc_ep_make_pair(x, xx)).hi;
+  __CLC_DOUBLEN p = s * __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+                 __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+                 __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+                 __clc_mad(s,
+                     0x1.5e089c751c08cp-16, -0x1.78809a9a29f71p-15),
+                     0x1.7746f90a8aaep-14), -0x1.bb44da6fbf144p-16),
+                     0x1.1e634a7943acfp-13), 0x1.d250fdeb68febp-13),
+                     0x1.37fd9b58c4d95p-11), 0x1.7d5af15120e2cp-10),
+                     0x1.d6d93e09491dfp-9), 0x1.226e12033784dp-7),
+                     0x1.664f49ac36ae2p-6), 0x1.ba1ba1b451c21p-5),
+                     0x1.11111111185b7p-3), 0x1.55555555554eep-2);
+  __CLC_EP_PAIR t = __clc_ep_fast_add(__clc_ep_make_pair(x, xx), 
__clc_ep_mul(x, p));
+  __CLC_EP_PAIR tr = __clc_ep_fast_recip(t);
+  return __CLC_CONVERT_LONGN(is_odd) ? -tr.hi : t.hi;
+}
+
 _CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
                                                __CLC_DOUBLEN xx,
                                                private __CLC_DOUBLEN *sinval,
@@ -49,8 +102,8 @@ _CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN 
x,
 
   __CLC_DOUBLEN x2 = x * x;
   __CLC_DOUBLEN x3 = x2 * x;
-  __CLC_DOUBLEN r = (__CLC_DOUBLEN)0.5 * x2;
-  __CLC_DOUBLEN t = (__CLC_DOUBLEN)1.0 - r;
+  __CLC_DOUBLEN r = __CLC_FP_LIT(0.5) * x2;
+  __CLC_DOUBLEN t = __CLC_FP_LIT(1.0) - r;
 
   __CLC_DOUBLEN sp = __clc_fma(
       __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, 
sc2);
@@ -62,10 +115,11 @@ _CLC_DEF _CLC_OVERLOAD void 
__clc_sincos_piby4(__CLC_DOUBLEN x,
                                               x2, cc3),
                                     x2, cc2),
                           x2, cc1),
-                x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));
+                x2 * x2, __clc_fma(x, xx, (__CLC_FP_LIT(1.0) - t) - r));
 
-  *sinval =
-      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, 
-xx));
+  *sinval = x - __clc_fma(-x3, sc1,
+                          __clc_fma(__clc_fma(-x3, sp, __CLC_FP_LIT(0.5) * xx),
+                                    x2, -xx));
   *cosval = cp;
 }
 
@@ -131,229 +185,128 @@ _CLC_DEF _CLC_OVERLOAD void 
__clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx,
   *tailval = c ? tptr : tpr;
 }
 
-// Reduction for medium sized arguments
-_CLC_DEF _CLC_OVERLOAD void
-__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
-                             private __CLC_DOUBLEN *rr,
-                             private __CLC_INTN *regn) {
+// Reduction for small sized arguments
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small(
+    __CLC_DOUBLEN x, private __CLC_DOUBLEN *rh, private __CLC_DOUBLEN *rt) {
   // How many pi/2 is x a multiple of?
-  const __CLC_DOUBLEN two_by_pi = 0x1.45f306dc9c883p-1;
-  __CLC_DOUBLEN dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));
+  const __CLC_DOUBLEN twobypi = 0x1.45f306dc9c883p-1;
+  const __CLC_DOUBLEN piby2_h = 0x1.921fb54442d18p+0;
+  const __CLC_DOUBLEN piby2_m = 0x1.1a62633145c00p-54;
+  const __CLC_DOUBLEN piby2_t = 0x1.b839a252049c0p-104;
 
-  const __CLC_DOUBLEN piby2_h = -7074237752028440.0 / 0x1.0p+52;
-  const __CLC_DOUBLEN piby2_m = -2483878800010755.0 / 0x1.0p+105;
-  const __CLC_DOUBLEN piby2_t = -3956492004828932.0 / 0x1.0p+158;
+  __CLC_DOUBLEN dn_pi2 = __clc_rint(x * twobypi);
 
   // Compute product of npi2 with 159 bits of 2/pi
-  __CLC_DOUBLEN p_hh = piby2_h * dnpi2;
-  __CLC_DOUBLEN p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
-  __CLC_DOUBLEN p_mh = piby2_m * dnpi2;
-  __CLC_DOUBLEN p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
-  __CLC_DOUBLEN p_th = piby2_t * dnpi2;
-  __CLC_DOUBLEN p_tt = __clc_fma(piby2_t, dnpi2, -p_th);
+  __CLC_DOUBLEN xt = __clc_fma(dn_pi2, -piby2_h, x);
+  __CLC_DOUBLEN yh = __clc_fma(dn_pi2, -piby2_m, xt);
+  __CLC_DOUBLEN ph = dn_pi2 * piby2_m;
+  __CLC_DOUBLEN pt = __clc_fma(dn_pi2, piby2_m, -ph);
 
   // Reduce to 159 bits
-  __CLC_DOUBLEN ph = p_hh;
-  __CLC_DOU...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/187455
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to