https://github.com/karolherbst created 
https://github.com/llvm/llvm-project/pull/189781

I'm seeing regressions among `atan2`, `atan2pi`, remainder`, `remquo`, `sinpi` 
and other functions.

I'm not in the mood to figure out why those commit broke it, hence I'm simply 
sending a bunch of reverts to let me pass the `math_brute_force` OpenCL CTS 
tests.

>From b9be58300b1c4382eea84d64773d5f7bf0f6885c Mon Sep 17 00:00:00 2001
From: Karol Herbst <[email protected]>
Date: Wed, 1 Apr 2026 02:41:02 +0200
Subject: [PATCH 1/5] Revert "libclc: Update atan2 (#188706)"

This reverts commit 158282b17652acd3a5ab282dbcc54967fd9837d7.
---
 libclc/clc/lib/generic/math/clc_atan2.cl  |  11 +-
 libclc/clc/lib/generic/math/clc_atan2.inc | 305 ++++++++++++++++------
 2 files changed, 224 insertions(+), 92 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_atan2.cl 
b/libclc/clc/lib/generic/math/clc_atan2.cl
index 04bb53e357260..1ba6751f5a0f1 100644
--- a/libclc/clc/lib/generic/math/clc_atan2.cl
+++ b/libclc/clc/lib/generic/math/clc_atan2.cl
@@ -6,20 +6,21 @@
 //
 
//===----------------------------------------------------------------------===//
 
+#include "clc/clc_convert.h"
 #include "clc/float/definitions.h"
 #include "clc/internal/clc.h"
-#include "clc/math/clc_atan_helpers.h"
 #include "clc/math/clc_copysign.h"
 #include "clc/math/clc_fabs.h"
 #include "clc/math/clc_fma.h"
-#include "clc/math/clc_fmax.h"
-#include "clc/math/clc_fmin.h"
 #include "clc/math/clc_ldexp.h"
 #include "clc/math/clc_mad.h"
+#include "clc/math/math.h"
+#include "clc/math/tables.h"
 #include "clc/relational/clc_isinf.h"
-#include "clc/relational/clc_isunordered.h"
+#include "clc/relational/clc_isnan.h"
 #include "clc/relational/clc_select.h"
-#include "clc/relational/clc_signbit.h"
+#include "clc/shared/clc_max.h"
+#include "clc/shared/clc_min.h"
 
 #define __CLC_BODY "clc_atan2.inc"
 #include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_atan2.inc 
b/libclc/clc/lib/generic/math/clc_atan2.inc
index f8e7c9638180d..19eaaeee0092d 100644
--- a/libclc/clc/lib/generic/math/clc_atan2.inc
+++ b/libclc/clc/lib/generic/math/clc_atan2.inc
@@ -6,112 +6,243 @@
 //
 
//===----------------------------------------------------------------------===//
 
-#pragma OPENCL FP_CONTRACT OFF
-
 #if __CLC_FPSIZE == 32
 
-_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_FLOATN __clc_atan2(__CLC_FLOATN y,
-                                                           __CLC_FLOATN x) {
-  const __CLC_FLOATN pi = 0x1.921fb6p+1f;
-  const __CLC_FLOATN piby2 = 0x1.921fb6p+0f;
-  const __CLC_FLOATN piby4 = 0x1.921fb6p-1f;
-  const __CLC_FLOATN threepiby4 = 0x1.2d97c8p+1f;
-
-  __CLC_FLOATN ax = __clc_fabs(x);
-  __CLC_FLOATN ay = __clc_fabs(y);
-  __CLC_FLOATN v = __clc_fmin(ax, ay);
-  __CLC_FLOATN u = __clc_fmax(ax, ay);
-
-  __CLC_FLOATN vbyu = v / u;
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y,
+                                                 __CLC_GENTYPE x) {
+  const __CLC_GENTYPE pi = 0x1.921fb6p+1f;
+  const __CLC_GENTYPE piby2 = 0x1.921fb6p+0f;
+  const __CLC_GENTYPE piby4 = 0x1.921fb6p-1f;
+  const __CLC_GENTYPE threepiby4 = 0x1.2d97c8p+1f;
+
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_GENTYPE ay = __clc_fabs(y);
+  __CLC_GENTYPE v = __clc_min(ax, ay);
+  __CLC_GENTYPE u = __clc_max(ax, ay);
+
+  // Scale since u could be large, as in "regular" divide
+  __CLC_GENTYPE s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+  __CLC_GENTYPE vbyu = s * MATH_DIVIDE(v, s * u);
+
+  __CLC_GENTYPE vbyu2 = vbyu * vbyu;
+
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+  __CLC_GENTYPE p =
+      __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f),
+                -0x1.5554d0p-2f) *
+      vbyu2 * vbyu;
+  __CLC_GENTYPE q =
+      __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+  __CLC_GENTYPE p =
+      __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f),
+                -0x1.55554ep-2f) *
+      vbyu2 * vbyu;
+  __CLC_GENTYPE q = __clc_mad(
+      vbyu2,
+      __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f),
+                0x1.76b4b8p+0f),
+      1.0f);
+#endif
 
-  __CLC_FLOATN a = __clc_atan_reduced(vbyu);
+  // Octant 0 result
+  __CLC_GENTYPE a = __clc_mad(p, MATH_RECIP(q), vbyu);
 
-  __CLC_FLOATN t = piby2 - a;
-  a = ay > ax ? t : a;
-  t = pi - a;
-  a = x < 0.0f ? t : a;
+  // Fix up 3 other octants
+  __CLC_GENTYPE at = piby2 - a;
+  a = ay > ax ? at : a;
+  at = pi - a;
+  a = x < 0.0F ? at : a;
 
-  t = __clc_signbit(x) ? pi : 0.0f;
-  a = y == 0.0f ? t : a;
+  // y == 0 => 0 for x >= 0, pi for x < 0
+  at = __CLC_AS_INTN(x) < 0 ? pi : 0.0f;
+  a = y == 0.0f ? at : a;
 
   // x and y are +- Inf
-  t = x < 0.0f ? threepiby4 : piby4;
-  a = (__clc_isinf(x) & __clc_isinf(y)) ? t : a;
+  at = x > 0.0f ? piby4 : threepiby4;
+  a = __clc_select(a, at, __clc_isinf(x) && __clc_isinf(y));
 
   // x or y is NaN
-  a = __clc_isunordered(x, y) ? FLT_NAN : a;
+  a = __clc_select(a, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isnan(y));
 
+  // Fixup sign and return
   return __clc_copysign(a, y);
 }
 
 #elif __CLC_FPSIZE == 64
 
-_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_DOUBLEN __clc_atan2(__CLC_DOUBLEN y,
-                                                            __CLC_DOUBLEN x) {
-  const __CLC_DOUBLEN pi = 0x1.921fb54442d18p+1;
-  const __CLC_DOUBLEN piby2 = 0x1.921fb54442d18p+0;
-  const __CLC_DOUBLEN piby4 = 0x1.921fb54442d18p-1;
-  const __CLC_DOUBLEN threepiby4 = 0x1.2d97c7f3321d2p+1;
-
-  __CLC_DOUBLEN ay = __clc_fabs(y);
-  __CLC_DOUBLEN ax = __clc_fabs(x);
-  __CLC_DOUBLEN u = __clc_fmax(ax, ay);
-  __CLC_DOUBLEN v = __clc_fmin(ax, ay);
-  __CLC_DOUBLEN vbyu = v / u;
-
-  __CLC_DOUBLEN a = __clc_atan_reduced(vbyu);
-  __CLC_LONGN xneg = __clc_signbit(x);
-
-  __CLC_DOUBLEN t = piby2 - a;
-  a = ax < ay ? t : a;
-  t = pi - a;
-  a = xneg ? t : a;
-
-  t = xneg ? pi : 0.0;
-  a = y == 0.0 ? t : a;
-
-  t = xneg ? threepiby4 : piby4;
-  t = __clc_copysign(t, y);
-  a = (__clc_isinf(x) && __clc_isinf(y)) ? t : a;
-
-  a = __clc_isunordered(x, y) ? DBL_NAN : a;
-  return __clc_copysign(a, y);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y,
+                                                 __CLC_GENTYPE x) {
+  const __CLC_GENTYPE pi = 3.1415926535897932e+00;    /* 0x400921fb54442d18 */
+  const __CLC_GENTYPE piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
+  const __CLC_GENTYPE piby4 = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */
+  // 0x4002d97c7f3321d2
+  const __CLC_GENTYPE three_piby4 = 2.3561944901923449e+00;
+  const __CLC_GENTYPE pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 
*/
+  const __CLC_GENTYPE pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 
*/
+  // 0x3ff921fb54442d18
+  const __CLC_GENTYPE piby2_head = 1.5707963267948965e+00;
+  // 0x3c91a62633145c07
+  const __CLC_GENTYPE piby2_tail = 6.1232339957367660e-17;
+
+  __CLC_GENTYPE x2 = x;
+  // Important to capture -0.0 in xneg and yneg, so comparison done as integer
+  __CLC_LONGN xneg = __CLC_AS_LONGN(x) < 0;
+  __CLC_INTN xexp =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> EXPSHIFTBITS_DP64) & 0x7ff;
+
+  __CLC_GENTYPE y2 = y;
+  __CLC_LONGN yneg = __CLC_AS_LONGN(y) < 0;
+  __CLC_INTN yexp =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y) >> EXPSHIFTBITS_DP64) & 0x7ff;
+
+  __CLC_LONGN cond2 = __CLC_CONVERT_LONGN(xexp < 1021 && yexp < 1021);
+  __CLC_LONGN diffexp = __CLC_CONVERT_LONGN(yexp - xexp);
+
+  // Scale up both x and y if they are both below 1/4
+  __CLC_GENTYPE x1 = __clc_ldexp(x, 1024);
+  __CLC_INTN xexp1 =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x1) >> EXPSHIFTBITS_DP64) & 0x7ff;
+  __CLC_GENTYPE y1 = __clc_ldexp(y, 1024);
+  __CLC_INTN yexp1 =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y1) >> EXPSHIFTBITS_DP64) & 0x7ff;
+  __CLC_LONGN diffexp1 = __CLC_CONVERT_LONGN(yexp1 - xexp1);
+
+  diffexp = __clc_select(diffexp, diffexp1, cond2);
+  x = cond2 ? x1 : x;
+  y = cond2 ? y1 : y;
+
+  // General case: take absolute values of arguments
+  __CLC_GENTYPE u = __clc_fabs(x);
+  __CLC_GENTYPE v = __clc_fabs(y);
+
+  // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+  __CLC_LONGN swap_vu = u < v;
+  __CLC_GENTYPE uu = u;
+  u = swap_vu ? v : u;
+  v = swap_vu ? uu : v;
+
+  __CLC_GENTYPE vbyu = v / u;
+  __CLC_GENTYPE q1, q2;
+
+  // General values of v/u. Use a look-up table and series expansion.
+
+  {
+    __CLC_GENTYPE val = vbyu > 0.0625 ? vbyu : 0.063;
+    __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5));
+    q1 = __CLC_USE_TABLE(atan_jby256_tbl_head, index - 16);
+    q2 = __CLC_USE_TABLE(atan_jby256_tbl_tail, index - 16);
+    __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8;
+
+    // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
+    // u_exponent could be EMAX so we have to do it in 2 steps
+    __CLC_INTN m =
+        -(__CLC_CONVERT_INTN(__CLC_AS_ULONGN(u) >> EXPSHIFTBITS_DP64) -
+          EXPBIAS_DP64);
+    __CLC_GENTYPE um = __clc_ldexp(u, m);
+    __CLC_GENTYPE vm = __clc_ldexp(v, m);
+
+    // 26 leading bits of u
+    __CLC_GENTYPE u1 =
+        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL);
+    __CLC_GENTYPE u2 = um - u1;
+
+    __CLC_GENTYPE r = MATH_DIVIDE(__clc_fma(-c, u2, __clc_fma(-c, u1, vm)),
+                                  __clc_fma(c, vm, um));
+
+    // Polynomial approximation to atan(r)
+    __CLC_GENTYPE s = r * r;
+    q2 = q2 + __clc_fma((s * __clc_fma(-s, 0.19999918038989143496,
+                                       0.33333333333224095522)),
+                        -r, r);
+  }
+
+  __CLC_GENTYPE q3, q4;
+  {
+    q3 = 0.0;
+    q4 = vbyu;
+  }
+
+  __CLC_GENTYPE q5, q6;
+  {
+    __CLC_GENTYPE u1 =
+        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL);
+    __CLC_GENTYPE u2 = u - u1;
+    __CLC_GENTYPE vu1 =
+        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL);
+    __CLC_GENTYPE vu2 = vbyu - vu1;
+
+    q5 = 0.0;
+    __CLC_GENTYPE s = vbyu * vbyu;
+    q6 = vbyu +
+         __clc_fma(
+             -vbyu * s,
+             __clc_fma(
+                 -s,
+                 __clc_fma(-s,
+                           __clc_fma(-s,
+                                     __clc_fma(-s, 0.90029810285449784439E-01,
+                                               0.11110736283514525407),
+                                     0.14285713561807169030),
+                           0.19999999999393223405),
+                 0.33333333333333170500),
+             MATH_DIVIDE(__clc_fma(-u, vu2,
+                                   __clc_fma(-u2, vu1, __clc_fma(-u1, vu1, 
v))),
+                         u));
+  }
+
+  q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
+  q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+  q1 = vbyu > 0.0625 ? q1 : q3;
+  q2 = vbyu > 0.0625 ? q2 : q4;
+
+  // Tidy-up according to which quadrant the arguments lie in
+  __CLC_GENTYPE res1, res2, res3, res4;
+  q1 = swap_vu ? piby2_head - q1 : q1;
+  q2 = swap_vu ? piby2_tail - q2 : q2;
+  q1 = xneg ? pi_head - q1 : q1;
+  q2 = xneg ? pi_tail - q2 : q2;
+  q1 = q1 + q2;
+  res4 = yneg ? -q1 : q1;
+
+  res1 = yneg ? -three_piby4 : three_piby4;
+  res2 = yneg ? -piby4 : piby4;
+  res3 = xneg ? res1 : res2;
+
+  res3 = __clc_select(res4, res3,
+                      __CLC_CONVERT_LONGN(__clc_isinf(x2) && __clc_isinf(y2)));
+  res1 = yneg ? -pi : pi;
+
+  // abs(x)/abs(y) > 2^56 and x < 0
+  res3 = (diffexp < -56 && xneg) ? res1 : res3;
+
+  res4 = MATH_DIVIDE(y, x);
+  // x positive and dominant over y by a factor of 2^28
+  res3 = diffexp < -28 && xneg == 0 ? res4 : res3;
+
+  // abs(y)/abs(x) > 2^56
+  res4 = yneg ? -piby2 : piby2; // atan(y/x) is insignificant compared to piby2
+  res3 = diffexp > 56 ? res4 : res3;
+
+  res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of 
y
+  res4 = xneg ? res1 : y2;
+
+  // Zero y gives +-0 for positive x and +-pi for negative x
+  res3 = y2 == 0.0 ? res4 : res3;
+  res3 = __clc_isnan(y2) ? y2 : res3;
+  res3 = __clc_isnan(x2) ? x2 : res3;
+
+  return res3;
 }
 
 #elif __CLC_FPSIZE == 16
 
-_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_HALFN __clc_atan2(__CLC_HALFN y,
-                                                          __CLC_HALFN x) {
-  const __CLC_HALFN pi = 0x1.921fb6p+1h;
-  const __CLC_HALFN piby2 = 0x1.921fb6p+0h;
-  const __CLC_HALFN piby4 = 0x1.921fb6p-1h;
-  const __CLC_HALFN threepiby4 = 0x1.2d97c8p+1h;
-
-  __CLC_HALFN ax = __clc_fabs(x);
-  __CLC_HALFN ay = __clc_fabs(y);
-  __CLC_HALFN v = __clc_fmin(ax, ay);
-  __CLC_HALFN u = __clc_fmax(ax, ay);
-
-  __CLC_HALFN vbyu = v / u;
-
-  __CLC_HALFN a = __clc_atan_reduced(vbyu);
-
-  __CLC_HALFN t = piby2 - a;
-  a = ay > ax ? t : a;
-  t = pi - a;
-  a = x < 0.0h ? t : a;
-
-  t = __clc_signbit(x) ? pi : 0.0h;
-  a = y == 0.0h ? t : a;
-
-  // x and y are +- Inf
-  t = x < 0.0h ? threepiby4 : piby4;
-  a = (__clc_isinf(x) && __clc_isinf(y)) ? t : a;
-
-  // x or y is NaN
-  a = __clc_isunordered(x, y) ? HALF_NAN : a;
-
-  return __clc_copysign(a, y);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE x,
+                                                 __CLC_GENTYPE y) {
+  return __CLC_CONVERT_GENTYPE(
+      __clc_atan2(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
 }
 
 #endif

>From 2ee1ad436bfcd85efd3e95b3e65e64edd0512c08 Mon Sep 17 00:00:00 2001
From: Karol Herbst <[email protected]>
Date: Wed, 1 Apr 2026 02:42:19 +0200
Subject: [PATCH 2/5] Revert "libclc: Update atan2pi (#188707)"

This reverts commit 8430e9e8f043d2ea28925e299157f760711cbedb.
---
 libclc/clc/lib/generic/math/clc_atan2pi.cl  |  10 +-
 libclc/clc/lib/generic/math/clc_atan2pi.inc | 263 ++++++++++++++------
 2 files changed, 198 insertions(+), 75 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_atan2pi.cl 
b/libclc/clc/lib/generic/math/clc_atan2pi.cl
index cd18b4876da86..cee4d8451d85a 100644
--- a/libclc/clc/lib/generic/math/clc_atan2pi.cl
+++ b/libclc/clc/lib/generic/math/clc_atan2pi.cl
@@ -6,19 +6,19 @@
 //
 
//===----------------------------------------------------------------------===//
 
+#include "clc/clc_convert.h"
 #include "clc/float/definitions.h"
 #include "clc/internal/clc.h"
-#include "clc/math/clc_atan_helpers.h"
 #include "clc/math/clc_copysign.h"
 #include "clc/math/clc_fabs.h"
 #include "clc/math/clc_fma.h"
-#include "clc/math/clc_fmax.h"
-#include "clc/math/clc_fmin.h"
+#include "clc/math/clc_ldexp.h"
 #include "clc/math/clc_mad.h"
+#include "clc/math/math.h"
+#include "clc/math/tables.h"
 #include "clc/relational/clc_isinf.h"
-#include "clc/relational/clc_isunordered.h"
+#include "clc/relational/clc_isnan.h"
 #include "clc/relational/clc_select.h"
-#include "clc/relational/clc_signbit.h"
 #include "clc/shared/clc_max.h"
 #include "clc/shared/clc_min.h"
 
diff --git a/libclc/clc/lib/generic/math/clc_atan2pi.inc 
b/libclc/clc/lib/generic/math/clc_atan2pi.inc
index 76b5e45547173..9f901947887d8 100644
--- a/libclc/clc/lib/generic/math/clc_atan2pi.inc
+++ b/libclc/clc/lib/generic/math/clc_atan2pi.inc
@@ -6,99 +6,222 @@
 //
 
//===----------------------------------------------------------------------===//
 
-#pragma OPENCL FP_CONTRACT OFF
-
 #if __CLC_FPSIZE == 32
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_FLOATN __clc_atan2pi(__CLC_FLOATN y,
-                                                             __CLC_FLOATN x) {
-  __CLC_FLOATN ax = __clc_fabs(x);
-  __CLC_FLOATN ay = __clc_fabs(y);
-  __CLC_FLOATN v = __clc_fmin(ax, ay);
-  __CLC_FLOATN u = __clc_fmax(ax, ay);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y,
+                                                   __CLC_GENTYPE x) {
+  const __CLC_GENTYPE pi = 0x1.921fb6p+1f;
+
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_GENTYPE ay = __clc_fabs(y);
+  __CLC_GENTYPE v = __clc_min(ax, ay);
+  __CLC_GENTYPE u = __clc_max(ax, ay);
+
+  // Scale since u could be large, as in "regular" divide
+  __CLC_GENTYPE s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+  __CLC_GENTYPE vbyu = s * MATH_DIVIDE(v, s * u);
 
-  __CLC_FLOATN vbyu = v / u;
+  __CLC_GENTYPE vbyu2 = vbyu * vbyu;
 
-  __CLC_FLOATN a = __clc_atanpi_reduced(vbyu);
+  __CLC_GENTYPE p =
+      __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f),
+                -0x1.5554d0p-2f) *
+      vbyu2 * vbyu;
+  __CLC_GENTYPE q =
+      __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
 
-  __CLC_FLOATN at = 0.5f - a;
+  // Octant 0 result
+  __CLC_GENTYPE a = MATH_DIVIDE(__clc_mad(p, MATH_RECIP(q), vbyu), pi);
+
+  // Fix up 3 other octants
+  __CLC_GENTYPE at = 0.5f - a;
   a = ay > ax ? at : a;
   at = 1.0f - a;
-  a = x < 0.0f ? at : a;
+  a = x < 0.0F ? at : a;
 
-  at = __clc_signbit(x) ? 1.0f : 0.0f;
+  // y == 0 => 0 for x >= 0, pi for x < 0
+  at = __CLC_AS_INTN(x) < 0 ? 1.0f : 0.0f;
   a = y == 0.0f ? at : a;
 
   // x and y are +- Inf
-  at = x < 0.0f ? 0.75f : 0.25f;
-  a = (__clc_isinf(x) & __clc_isinf(y)) ? at : a;
+  at = x > 0.0f ? 0.25f : 0.75f;
+  a = __clc_select(a, at, __clc_isinf(x) && __clc_isinf(y));
 
   // x or y is NaN
-  a = __clc_isunordered(x, y) ? FLT_NAN : a;
+  a = __clc_select(a, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isnan(y));
 
+  // Fixup sign and return
   return __clc_copysign(a, y);
 }
 
 #elif __CLC_FPSIZE == 64
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_DOUBLEN __clc_atan2pi(__CLC_DOUBLEN y,
-                                                              __CLC_DOUBLEN x) 
{
-  __CLC_DOUBLEN ay = __clc_fabs(y);
-  __CLC_DOUBLEN ax = __clc_fabs(x);
-  __CLC_DOUBLEN u = __clc_fmax(ax, ay);
-  __CLC_DOUBLEN v = __clc_fmin(ax, ay);
-  __CLC_DOUBLEN vbyu = v / u;
-
-  __CLC_DOUBLEN a = __clc_atanpi_reduced(vbyu);
-
-  __CLC_LONGN xneg = __clc_signbit(x);
-
-  __CLC_DOUBLEN t = 0.5 - a;
-  a = ax < ay ? t : a;
-  t = 1.0 - a;
-  a = xneg ? t : a;
-
-  t = xneg ? 1.0 : 0.0;
-  a = y == 0.0 ? t : a;
-
-  t = xneg ? 0.75 : 0.25;
-  t = __clc_copysign(t, y);
-  a = (__clc_isinf(x) & __clc_isinf(y)) ? t : a;
-
-  a = __clc_isunordered(x, y) ? DBL_NAN : a;
-
-  return __clc_copysign(a, y);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y,
+                                                   __CLC_GENTYPE x) {
+  const __CLC_GENTYPE pi = 3.1415926535897932e+00;      /* 0x400921fb54442d18 
*/
+  const __CLC_GENTYPE pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 
*/
+  const __CLC_GENTYPE pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 
*/
+  // 0x3ff921fb54442d18
+  const __CLC_GENTYPE piby2_head = 1.5707963267948965e+00;
+  // 0x3c91a62633145c07
+  const __CLC_GENTYPE piby2_tail = 6.1232339957367660e-17;
+
+  __CLC_GENTYPE x2 = x;
+  __CLC_LONGN xneg = __CLC_AS_LONGN(x) < 0;
+  __CLC_INTN xexp =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> EXPSHIFTBITS_DP64) & 0x7ff;
+
+  __CLC_GENTYPE y2 = y;
+  __CLC_LONGN yneg = __CLC_AS_LONGN(y) < 0;
+  __CLC_INTN yexp =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y) >> EXPSHIFTBITS_DP64) & 0x7ff;
+
+  __CLC_LONGN cond2 = __CLC_CONVERT_LONGN(xexp < 1021 & yexp < 1021);
+  __CLC_LONGN diffexp = __CLC_CONVERT_LONGN(yexp - xexp);
+
+  // Scale up both x and y if they are both below 1/4
+  __CLC_GENTYPE x1 = __clc_ldexp(x, 1024);
+  __CLC_INTN xexp1 =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x1) >> EXPSHIFTBITS_DP64) & 0x7ff;
+  __CLC_GENTYPE y1 = __clc_ldexp(y, 1024);
+  __CLC_INTN yexp1 =
+      __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y1) >> EXPSHIFTBITS_DP64) & 0x7ff;
+  __CLC_LONGN diffexp1 = __CLC_CONVERT_LONGN(yexp1 - xexp1);
+
+  diffexp = __clc_select(diffexp, diffexp1, cond2);
+  x = cond2 ? x1 : x;
+  y = cond2 ? y1 : y;
+
+  // General case: take absolute values of arguments
+  __CLC_GENTYPE u = __clc_fabs(x);
+  __CLC_GENTYPE v = __clc_fabs(y);
+
+  // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+  __CLC_LONGN swap_vu = u < v;
+  __CLC_GENTYPE uu = u;
+  u = swap_vu ? v : u;
+  v = swap_vu ? uu : v;
+
+  __CLC_GENTYPE vbyu = v / u;
+  __CLC_GENTYPE q1, q2;
+
+  // General values of v/u. Use a look-up table and series expansion.
+
+  {
+    __CLC_GENTYPE val = vbyu > 0.0625 ? vbyu : 0.063;
+    __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5));
+    q1 = __CLC_USE_TABLE(atan_jby256_tbl_head, (index - 16));
+    q2 = __CLC_USE_TABLE(atan_jby256_tbl_tail, (index - 16));
+    __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8;
+
+    // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
+    // u_exponent could be EMAX so we have to do it in 2 steps
+    __CLC_INTN m =
+        -(__CLC_CONVERT_INTN(__CLC_AS_ULONGN(u) >> EXPSHIFTBITS_DP64) -
+          EXPBIAS_DP64);
+    __CLC_GENTYPE um = __clc_ldexp(u, m);
+    __CLC_GENTYPE vm = __clc_ldexp(v, m);
+
+    // 26 leading bits of u
+    __CLC_GENTYPE u1 =
+        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL);
+    __CLC_GENTYPE u2 = um - u1;
+
+    __CLC_GENTYPE r = MATH_DIVIDE(__clc_fma(-c, u2, __clc_fma(-c, u1, vm)),
+                                  __clc_fma(c, vm, um));
+
+    // Polynomial approximation to atan(r)
+    __CLC_GENTYPE s = r * r;
+    q2 = q2 + __clc_fma((s * __clc_fma(-s, 0.19999918038989143496,
+                                       0.33333333333224095522)),
+                        -r, r);
+  }
+
+  __CLC_GENTYPE q3, q4;
+  {
+    q3 = 0.0;
+    q4 = vbyu;
+  }
+
+  __CLC_GENTYPE q5, q6;
+  {
+    __CLC_GENTYPE u1 =
+        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL);
+    __CLC_GENTYPE u2 = u - u1;
+    __CLC_GENTYPE vu1 =
+        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL);
+    __CLC_GENTYPE vu2 = vbyu - vu1;
+
+    q5 = 0.0;
+    __CLC_GENTYPE s = vbyu * vbyu;
+    q6 = vbyu +
+         __clc_fma(
+             -vbyu * s,
+             __clc_fma(
+                 -s,
+                 __clc_fma(-s,
+                           __clc_fma(-s,
+                                     __clc_fma(-s, 0.90029810285449784439E-01,
+                                               0.11110736283514525407),
+                                     0.14285713561807169030),
+                           0.19999999999393223405),
+                 0.33333333333333170500),
+             MATH_DIVIDE(__clc_fma(-u, vu2,
+                                   __clc_fma(-u2, vu1, __clc_fma(-u1, vu1, 
v))),
+                         u));
+  }
+
+  q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
+  q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+  q1 = vbyu > 0.0625 ? q1 : q3;
+  q2 = vbyu > 0.0625 ? q2 : q4;
+
+  // Tidy-up according to which quadrant the arguments lie in
+  __CLC_GENTYPE res1, res2, res3, res4;
+  q1 = swap_vu ? piby2_head - q1 : q1;
+  q2 = swap_vu ? piby2_tail - q2 : q2;
+  q1 = xneg ? pi_head - q1 : q1;
+  q2 = xneg ? pi_tail - q2 : q2;
+  q1 = MATH_DIVIDE(q1 + q2, pi);
+  res4 = yneg ? -q1 : q1;
+
+  res1 = yneg ? -0.75 : 0.75;
+  res2 = yneg ? -0.25 : 0.25;
+  res3 = xneg ? res1 : res2;
+
+  res3 = __clc_select(res4, res3,
+                      __CLC_CONVERT_LONGN(__clc_isinf(y2) & __clc_isinf(x2)));
+  res1 = yneg ? -1.0 : 1.0;
+
+  // abs(x)/abs(y) > 2^56 and x < 0
+  res3 = diffexp < -56 && xneg ? res1 : res3;
+
+  res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi);
+  // x positive and dominant over y by a factor of 2^28
+  res3 = diffexp < -28 && xneg == 0 ? res4 : res3;
+
+  // abs(y)/abs(x) > 2^56
+  res4 = yneg ? -0.5 : 0.5; // atan(y/x) is insignificant compared to piby2
+  res3 = diffexp > 56 ? res4 : res3;
+
+  res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of 
y
+  res4 = xneg ? res1 : y2;
+
+  // Zero y gives +-0 for positive x and +-pi for negative x
+  res3 = y2 == 0.0 ? res4 : res3;
+  res3 = __clc_isnan(y2) ? y2 : res3;
+  res3 = __clc_isnan(x2) ? x2 : res3;
+
+  return res3;
 }
 
 #elif __CLC_FPSIZE == 16
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_HALFN __clc_atan2pi(__CLC_HALFN y,
-                                                            __CLC_HALFN x) {
-  __CLC_HALFN ax = __clc_fabs(x);
-  __CLC_HALFN ay = __clc_fabs(y);
-  __CLC_HALFN v = __clc_fmin(ax, ay);
-  __CLC_HALFN u = __clc_fmax(ax, ay);
-
-  __CLC_HALFN vbyu = v / u;
-
-  __CLC_HALFN a = __clc_atanpi_reduced(vbyu);
-
-  __CLC_HALFN at = 0.5h - a;
-  a = ay > ax ? at : a;
-  at = 1.0h - a;
-  a = x < 0.0h ? at : a;
-
-  at = __clc_signbit(x) ? 1.0h : 0.0h;
-  a = y == 0.0h ? at : a;
-
-  // x and y are +- Inf
-  at = x < 0.0h ? 0.75h : 0.25h;
-  a = (__clc_isinf(x) & __clc_isinf(y)) ? at : a;
-
-  // x or y is NaN
-  a = __clc_isunordered(x, y) ? HALF_NAN : a;
-
-  return __clc_copysign(a, y);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y) {
+  return __CLC_CONVERT_GENTYPE(
+      __clc_atan2pi(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
 }
 
 #endif

>From 457734770dd66b0a2c6f350939baef7e6732be5c Mon Sep 17 00:00:00 2001
From: Karol Herbst <[email protected]>
Date: Wed, 1 Apr 2026 02:43:25 +0200
Subject: [PATCH 3/5] Revert "libclc: Implement remainder with remquo"

This reverts commit befad798a91ef8c74f605bdb15c352528f8d9807.
---
 libclc/clc/lib/generic/math/clc_remainder.cl  | 223 +++++++++++++++++-
 libclc/clc/lib/generic/math/clc_remainder.inc |  13 -
 2 files changed, 221 insertions(+), 15 deletions(-)
 delete mode 100644 libclc/clc/lib/generic/math/clc_remainder.inc

diff --git a/libclc/clc/lib/generic/math/clc_remainder.cl 
b/libclc/clc/lib/generic/math/clc_remainder.cl
index d3979fbac3ffd..f1dba87ee5b43 100644
--- a/libclc/clc/lib/generic/math/clc_remainder.cl
+++ b/libclc/clc/lib/generic/math/clc_remainder.cl
@@ -6,8 +6,227 @@
 //
 
//===----------------------------------------------------------------------===//
 
+#include "clc/clc_convert.h"
+#include "clc/integer/clc_clz.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_floor.h"
+#include "clc/math/clc_fma.h"
+#include "clc/math/clc_ldexp.h"
 #include "clc/math/clc_remainder.h"
-#include "clc/math/clc_remquo.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/math/math.h"
+#include "clc/shared/clc_max.h"
 
-#define __CLC_BODY "clc_remainder.inc"
+_CLC_DEF _CLC_OVERLOAD float __clc_remainder(float x, float y) {
+  int ux = __clc_as_int(x);
+  int ax = ux & EXSIGNBIT_SP32;
+  float xa = __clc_as_float(ax);
+  int sx = ux ^ ax;
+  int ex = ax >> EXPSHIFTBITS_SP32;
+
+  int uy = __clc_as_int(y);
+  int ay = uy & EXSIGNBIT_SP32;
+  float ya = __clc_as_float(ay);
+  int ey = ay >> EXPSHIFTBITS_SP32;
+
+  float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
+  float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
+  int c;
+  int k = ex - ey;
+
+  uint q = 0;
+
+  while (k > 0) {
+    c = xr >= yr;
+    q = (q << 1) | c;
+    xr -= c ? yr : 0.0f;
+    xr += xr;
+    --k;
+  }
+
+  c = xr > yr;
+  q = (q << 1) | c;
+  xr -= c ? yr : 0.0f;
+
+  int lt = ex < ey;
+
+  q = lt ? 0 : q;
+  xr = lt ? xa : xr;
+  yr = lt ? ya : yr;
+
+  c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
+  xr -= c ? yr : 0.0f;
+  q += c;
+
+  float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
+  xr *= lt ? 1.0f : s;
+
+  c = ax == ay;
+  xr = c ? 0.0f : xr;
+
+  xr = __clc_as_float(sx ^ __clc_as_int(xr));
+
+  c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
+      ay == 0;
+  xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
+
+  return xr;
+}
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_remainder
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
 #include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {
+  ulong ux = __clc_as_ulong(x);
+  ulong ax = ux & ~SIGNBIT_DP64;
+  ulong xsgn = ux ^ ax;
+  double dx = __clc_as_double(ax);
+  int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
+  int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
+  xexp1 = xexp < 1 ? xexp1 : xexp;
+
+  ulong uy = __clc_as_ulong(y);
+  ulong ay = uy & ~SIGNBIT_DP64;
+  double dy = __clc_as_double(ay);
+  int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
+  int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
+  yexp1 = yexp < 1 ? yexp1 : yexp;
+
+  int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
+
+  // First assume |x| > |y|
+
+  // Set ntimes to the number of times we need to do a
+  // partial remainder. If the exponent of x is an exact multiple
+  // of 53 larger than the exponent of y, and the mantissa of x is
+  // less than the mantissa of y, ntimes will be one too large
+  // but it doesn't matter - it just means that we'll go round
+  // the loop below one extra time.
+  int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
+  double w = __clc_ldexp(dy, ntimes * 53);
+  w = ntimes == 0 ? dy : w;
+  double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
+
+  // Each time round the loop we compute a partial remainder.
+  // This is done by subtracting a large multiple of w
+  // from x each time, where w is a scaled up version of y.
+  // The subtraction must be performed exactly in quad
+  // precision, though the result at each stage can
+  // fit exactly in a double precision number.
+  int i;
+  double t, v, p, pp;
+
+  for (i = 0; i < ntimes; i++) {
+    // Compute integral multiplier
+    t = __clc_trunc(dx / w);
+
+    // Compute w * t in quad precision
+    p = w * t;
+    pp = __clc_fma(w, t, -p);
+
+    // Subtract w * t from dx
+    v = dx - p;
+    dx = v + (((dx - v) - p) - pp);
+
+    // If t was one too large, dx will be negative. Add back one w.
+    dx += dx < 0.0 ? w : 0.0;
+
+    // Scale w down by 2^(-53) for the next iteration
+    w *= scale;
+  }
+
+  // One more time
+  // Variable todd says whether the integer t is odd or not
+  t = __clc_floor(dx / w);
+  long lt = (long)t;
+  int todd = lt & 1;
+
+  p = w * t;
+  pp = __clc_fma(w, t, -p);
+  v = dx - p;
+  dx = v + (((dx - v) - p) - pp);
+  i = dx < 0.0;
+  todd ^= i;
+  dx += i ? w : 0.0;
+
+  // At this point, dx lies in the range [0,dy)
+
+  // For the fmod function, we're done apart from setting the correct sign.
+  //
+  // For the remainder function, we need to adjust dx
+  // so that it lies in the range (-y/2, y/2] by carefully
+  // subtracting w (== dy == y) if necessary. The rigmarole
+  // with todd is to get the correct sign of the result
+  // when x/y lies exactly half way between two integers,
+  // when we need to choose the even integer.
+
+  int al = (2.0 * dx > w) | (todd & (2.0 * dx == w));
+  double dxl = dx - (al ? w : 0.0);
+
+  int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w));
+  double dxg = dx - (ag ? w : 0.0);
+
+  dx = dy < 0x1.0p+1022 ? dxl : dxg;
+
+  double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
+  dx = __clc_as_double(ax);
+
+  // Now handle |x| == |y|
+  int c = dx == dy;
+  t = __clc_as_double(xsgn);
+  ret = c ? t : ret;
+
+  // Next, handle |x| < |y|
+  c = dx < dy;
+  ret = c ? x : ret;
+
+  c &= (yexp<1023 & 2.0 * dx> dy) | (dx > 0.5 * dy);
+  // we could use a conversion here instead since qsgn = +-1
+  p = qsgn == 1 ? -1.0 : 1.0;
+  t = __clc_fma(y, p, x);
+  ret = c ? t : ret;
+
+  // We don't need anything special for |x| == 0
+
+  // |y| is 0
+  c = dy == 0.0;
+  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+  // y is +-Inf, NaN
+  c = yexp > BIASEDEMAX_DP64;
+  t = y == y ? x : y;
+  ret = c ? t : ret;
+
+  // x is +=Inf, NaN
+  c = xexp > BIASEDEMAX_DP64;
+  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+  return ret;
+}
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_FUNCTION __clc_remainder
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// Forward the half version of this builtin onto the float one
+#define __CLC_HALF_ONLY
+#define __CLC_FUNCTION __clc_remainder
+#define __CLC_BODY "clc/math/binary_def_via_fp32.inc"
+#include "clc/math/gentype.inc"
+
+#endif
diff --git a/libclc/clc/lib/generic/math/clc_remainder.inc 
b/libclc/clc/lib/generic/math/clc_remainder.inc
deleted file mode 100644
index 9f2e0ba1539a6..0000000000000
--- a/libclc/clc/lib/generic/math/clc_remainder.inc
+++ /dev/null
@@ -1,13 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
-__clc_remainder(__CLC_GENTYPE x, __CLC_GENTYPE y) {
-  __CLC_INTN unused_quo;
-  return __clc_remquo(x, y, &unused_quo);
-}

>From e30a3c43bff5ffa50ca14570173bea3eefd3f703 Mon Sep 17 00:00:00 2001
From: Karol Herbst <[email protected]>
Date: Wed, 1 Apr 2026 02:44:30 +0200
Subject: [PATCH 4/5] Revert "libclc: Update remquo (#187998)"

This reverts commit 1a9fe1769a0f9684dc9ecfcb7a2cec2d66077cc3.
---
 libclc/clc/include/clc/math/remquo_decl.inc   |  34 +--
 libclc/clc/lib/generic/math/clc_remquo.cl     |  56 +---
 libclc/clc/lib/generic/math/clc_remquo.inc    | 273 +++++++++++++++++-
 .../clc/lib/generic/math/clc_remquo_stret.inc | 158 ----------
 4 files changed, 287 insertions(+), 234 deletions(-)
 delete mode 100644 libclc/clc/lib/generic/math/clc_remquo_stret.inc

diff --git a/libclc/clc/include/clc/math/remquo_decl.inc 
b/libclc/clc/include/clc/math/remquo_decl.inc
index 8ba601199ef0f..cba28a7244eb4 100644
--- a/libclc/clc/include/clc/math/remquo_decl.inc
+++ b/libclc/clc/include/clc/math/remquo_decl.inc
@@ -6,29 +6,19 @@
 //
 
//===----------------------------------------------------------------------===//
 
-typedef struct __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE) {
-  __CLC_GENTYPE rem;
-  __CLC_INTN quo;
-} __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     __CLC_GENTYPE y,
+                                                     private __CLC_INTN *q);
 
-#define __CLC_REMQUO_RET_GENTYPE __CLC_XCONCAT(__clc_remquo_ret_, 
__CLC_GENTYPE)
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     __CLC_GENTYPE y,
+                                                     global __CLC_INTN *q);
 
-_CLC_OVERLOAD _CLC_DECL __CLC_REMQUO_RET_GENTYPE
-__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y);
-
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
-                                                   __CLC_GENTYPE y,
-                                                   private __CLC_INTN *q);
-
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
-                                                   __CLC_GENTYPE y,
-                                                   global __CLC_INTN *q);
-
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
-                                                   __CLC_GENTYPE y,
-                                                   local __CLC_INTN *q);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     __CLC_GENTYPE y,
+                                                     local __CLC_INTN *q);
 #if _CLC_GENERIC_AS_SUPPORTED
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
-                                                   __CLC_GENTYPE y,
-                                                   generic __CLC_INTN *q);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     __CLC_GENTYPE y,
+                                                     generic __CLC_INTN *q);
 #endif
diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl 
b/libclc/clc/lib/generic/math/clc_remquo.cl
index 502b9e5edc405..e254093d591d4 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.cl
+++ b/libclc/clc/lib/generic/math/clc_remquo.cl
@@ -6,58 +6,32 @@
 //
 
//===----------------------------------------------------------------------===//
 
-#include "clc/math/clc_remquo.h"
-
 #include "clc/clc_convert.h"
-#include "clc/float/definitions.h"
 #include "clc/integer/clc_clz.h"
-#include "clc/math/clc_copysign.h"
-#include "clc/math/clc_fabs.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_floor.h"
 #include "clc/math/clc_flush_if_daz.h"
 #include "clc/math/clc_fma.h"
-#include "clc/math/clc_frexp.h"
 #include "clc/math/clc_ldexp.h"
-#include "clc/math/clc_mad.h"
-#include "clc/math/clc_recip_fast.h"
-#include "clc/math/clc_rint.h"
 #include "clc/math/clc_subnormal_config.h"
 #include "clc/math/clc_trunc.h"
 #include "clc/math/math.h"
-#include "clc/relational/clc_isfinite.h"
-#include "clc/relational/clc_isnan.h"
-#include "clc/relational/clc_signbit.h"
-
-#define __CLC_FUNCTION __clc_remquo_stret
-#define __CLC_BODY "clc_remquo_stret.inc"
-#include "clc/math/gentype.inc"
-#undef __CLC_FUNCTION
-
-#define __CLC_FUNCTION __clc_remquo
-#define __CLC_BODY "clc_remquo.inc"
-#include "clc/math/gentype.inc"
+#include "clc/shared/clc_max.h"
 
-#define __CLC_OUT_ARG3_SCALAR_TYPE int
-#define __CLC_OUT_ARG3_ADDRESS_SPACE __private
-#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
-#include "clc/math/gentype.inc"
-#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+#define __CLC_ADDRESS_SPACE private
+#include "clc_remquo.inc"
+#undef __CLC_ADDRESS_SPACE
 
-#define __CLC_OUT_ARG3_SCALAR_TYPE int
-#define __CLC_OUT_ARG3_ADDRESS_SPACE __local
-#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
-#include "clc/math/gentype.inc"
-#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+#define __CLC_ADDRESS_SPACE global
+#include "clc_remquo.inc"
+#undef __CLC_ADDRESS_SPACE
 
-#define __CLC_OUT_ARG3_SCALAR_TYPE int
-#define __CLC_OUT_ARG3_ADDRESS_SPACE __global
-#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
-#include "clc/math/gentype.inc"
-#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+#define __CLC_ADDRESS_SPACE local
+#include "clc_remquo.inc"
+#undef __CLC_ADDRESS_SPACE
 
 #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define __CLC_OUT_ARG3_SCALAR_TYPE int
-#define __CLC_OUT_ARG3_ADDRESS_SPACE
-#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
-#include "clc/math/gentype.inc"
-#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+#define __CLC_ADDRESS_SPACE generic
+#include "clc_remquo.inc"
+#undef __CLC_ADDRESS_SPACE
 #endif
diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc 
b/libclc/clc/lib/generic/math/clc_remquo.inc
index 649bdd9ee8b65..cf8a5ebcea20c 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -6,19 +6,266 @@
 //
 
//===----------------------------------------------------------------------===//
 
-#ifdef __CLC_SCALAR
-#define __CLC_REMQUO_DEF(addrspace)                                            
\
-  _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_remquo(                           
\
-      __CLC_GENTYPE x, __CLC_GENTYPE y, addrspace __CLC_INTN *quo_out) {       
\
-    __CLC_REMQUO_RET_GENTYPE result = __clc_remquo_stret(x, y);                
\
-    *quo_out = result.quo;                                                     
\
-    return result.rem;                                                         
\
+_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
+                                          __CLC_ADDRESS_SPACE int *quo) {
+  x = __clc_flush_if_daz(x);
+  y = __clc_flush_if_daz(y);
+  int ux = __clc_as_int(x);
+  int ax = ux & EXSIGNBIT_SP32;
+  float xa = __clc_as_float(ax);
+  int sx = ux ^ ax;
+  int ex = ax >> EXPSHIFTBITS_SP32;
+
+  int uy = __clc_as_int(y);
+  int ay = uy & EXSIGNBIT_SP32;
+  float ya = __clc_as_float(ay);
+  int sy = uy ^ ay;
+  int ey = ay >> EXPSHIFTBITS_SP32;
+
+  float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
+  float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
+  int c;
+  int k = ex - ey;
+
+  uint q = 0;
+
+  while (k > 0) {
+    c = xr >= yr;
+    q = (q << 1) | c;
+    xr -= c ? yr : 0.0f;
+    xr += xr;
+    --k;
+  }
+
+  c = xr > yr;
+  q = (q << 1) | c;
+  xr -= c ? yr : 0.0f;
+
+  int lt = ex < ey;
+
+  q = lt ? 0 : q;
+  xr = lt ? xa : xr;
+  yr = lt ? ya : yr;
+
+  c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
+  xr -= c ? yr : 0.0f;
+  q += c;
+
+  float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
+  xr *= lt ? 1.0f : s;
+
+  int qsgn = sx == sy ? 1 : -1;
+  int quot = (q & 0x7f) * qsgn;
+
+  c = ax == ay;
+  quot = c ? qsgn : quot;
+  xr = c ? 0.0f : xr;
+
+  xr = __clc_as_float(sx ^ __clc_as_int(xr));
+
+  c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
+      ay == 0;
+  quot = c ? 0 : quot;
+  xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
+
+  *quo = quot;
+
+  return xr;
+}
+
+// remquo signature is special, we don't have macro for this
+#define __CLC_VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE)                        
\
+  _CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo(                          
\
+      TYPE##VEC_SIZE x, TYPE##VEC_SIZE y,                                      
\
+      __CLC_ADDRESS_SPACE int##VEC_SIZE *quo) {                                
\
+    int##HALF_VEC_SIZE lo, hi;                                                 
\
+    TYPE##VEC_SIZE ret;                                                        
\
+    ret.lo = __clc_remquo(x.lo, y.lo, &lo);                                    
\
+    ret.hi = __clc_remquo(x.hi, y.hi, &hi);                                    
\
+    (*quo).lo = lo;                                                            
\
+    (*quo).hi = hi;                                                            
\
+    return ret;                                                                
\
   }
 
-__CLC_REMQUO_DEF(private)
-__CLC_REMQUO_DEF(local)
-__CLC_REMQUO_DEF(global)
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-__CLC_REMQUO_DEF(generic)
+#define __CLC_VEC3_REMQUO(TYPE)                                                
\
+  _CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo(                                 
\
+      TYPE##3 x, TYPE##3 y, __CLC_ADDRESS_SPACE int##3 * quo) {                
\
+    int2 lo;                                                                   
\
+    int hi;                                                                    
\
+    TYPE##3 ret;                                                               
\
+    ret.s01 = __clc_remquo(x.s01, y.s01, &lo);                                 
\
+    ret.s2 = __clc_remquo(x.s2, y.s2, &hi);                                    
\
+    (*quo).s01 = lo;                                                           
\
+    (*quo).s2 = hi;                                                            
\
+    return ret;                                                                
\
+  }
+__CLC_VEC_REMQUO(float, 2, )
+__CLC_VEC3_REMQUO(float)
+__CLC_VEC_REMQUO(float, 4, 2)
+__CLC_VEC_REMQUO(float, 8, 4)
+__CLC_VEC_REMQUO(float, 16, 8)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
+                                           __CLC_ADDRESS_SPACE int *pquo) {
+  ulong ux = __clc_as_ulong(x);
+  ulong ax = ux & ~SIGNBIT_DP64;
+  ulong xsgn = ux ^ ax;
+  double dx = __clc_as_double(ax);
+  int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
+  int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
+  xexp1 = xexp < 1 ? xexp1 : xexp;
+
+  ulong uy = __clc_as_ulong(y);
+  ulong ay = uy & ~SIGNBIT_DP64;
+  double dy = __clc_as_double(ay);
+  int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
+  int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
+  yexp1 = yexp < 1 ? yexp1 : yexp;
+
+  int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
+
+  // First assume |x| > |y|
+
+  // Set ntimes to the number of times we need to do a
+  // partial remainder. If the exponent of x is an exact multiple
+  // of 53 larger than the exponent of y, and the mantissa of x is
+  // less than the mantissa of y, ntimes will be one too large
+  // but it doesn't matter - it just means that we'll go round
+  // the loop below one extra time.
+  int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
+  double w = __clc_ldexp(dy, ntimes * 53);
+  w = ntimes == 0 ? dy : w;
+  double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
+
+  // Each time round the loop we compute a partial remainder.
+  // This is done by subtracting a large multiple of w
+  // from x each time, where w is a scaled up version of y.
+  // The subtraction must be performed exactly in quad
+  // precision, though the result at each stage can
+  // fit exactly in a double precision number.
+  int i;
+  double t, v, p, pp;
+
+  for (i = 0; i < ntimes; i++) {
+    // Compute integral multiplier
+    t = __clc_trunc(dx / w);
+
+    // Compute w * t in quad precision
+    p = w * t;
+    pp = __clc_fma(w, t, -p);
+
+    // Subtract w * t from dx
+    v = dx - p;
+    dx = v + (((dx - v) - p) - pp);
+
+    // If t was one too large, dx will be negative. Add back one w.
+    dx += dx < 0.0 ? w : 0.0;
+
+    // Scale w down by 2^(-53) for the next iteration
+    w *= scale;
+  }
+
+  // One more time
+  // Variable todd says whether the integer t is odd or not
+  t = __clc_floor(dx / w);
+  long lt = (long)t;
+  int todd = lt & 1;
+
+  p = w * t;
+  pp = __clc_fma(w, t, -p);
+  v = dx - p;
+  dx = v + (((dx - v) - p) - pp);
+  i = dx < 0.0;
+  todd ^= i;
+  dx += i ? w : 0.0;
+
+  lt -= i;
+
+  // At this point, dx lies in the range [0,dy)
+
+  // For the remainder function, we need to adjust dx
+  // so that it lies in the range (-y/2, y/2] by carefully
+  // subtracting w (== dy == y) if necessary. The rigmarole
+  // with todd is to get the correct sign of the result
+  // when x/y lies exactly half way between two integers,
+  // when we need to choose the even integer.
+
+  int al = (2.0 * dx > w) | (todd & (2.0 * dx == w));
+  double dxl = dx - (al ? w : 0.0);
+
+  int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w));
+  double dxg = dx - (ag ? w : 0.0);
+
+  dx = dy < 0x1.0p+1022 ? dxl : dxg;
+  lt += dy < 0x1.0p+1022 ? al : ag;
+  int quo = ((int)lt & 0x7f) * qsgn;
+
+  double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
+  dx = __clc_as_double(ax);
+
+  // Now handle |x| == |y|
+  int c = dx == dy;
+  t = __clc_as_double(xsgn);
+  quo = c ? qsgn : quo;
+  ret = c ? t : ret;
+
+  // Next, handle |x| < |y|
+  c = dx < dy;
+  quo = c ? 0 : quo;
+  ret = c ? x : ret;
+
+  c &= (yexp < 1023 & 2.0 * dx > dy) | (dx > 0.5 * dy);
+  quo = c ? qsgn : quo;
+  // we could use a conversion here instead since qsgn = +-1
+  p = qsgn == 1 ? -1.0 : 1.0;
+  t = __clc_fma(y, p, x);
+  ret = c ? t : ret;
+
+  // We don't need anything special for |x| == 0
+
+  // |y| is 0
+  c = dy == 0.0;
+  quo = c ? 0 : quo;
+  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+  // y is +-Inf, NaN
+  c = yexp > BIASEDEMAX_DP64;
+  quo = c ? 0 : quo;
+  t = y == y ? x : y;
+  ret = c ? t : ret;
+
+  // x is +=Inf, NaN
+  c = xexp > BIASEDEMAX_DP64;
+  quo = c ? 0 : quo;
+  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+  *pquo = quo;
+  return ret;
+}
+__CLC_VEC_REMQUO(double, 2, )
+__CLC_VEC3_REMQUO(double)
+__CLC_VEC_REMQUO(double, 4, 2)
+__CLC_VEC_REMQUO(double, 8, 4)
+__CLC_VEC_REMQUO(double, 16, 8)
+
+#endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y,
+                                         __CLC_ADDRESS_SPACE int *pquo) {
+  return (half)__clc_remquo((float)x, (float)y, pquo);
+}
+__CLC_VEC_REMQUO(half, 2, )
+__CLC_VEC3_REMQUO(half)
+__CLC_VEC_REMQUO(half, 4, 2)
+__CLC_VEC_REMQUO(half, 8, 4)
+__CLC_VEC_REMQUO(half, 16, 8)
+
 #endif
-#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/generic/math/clc_remquo_stret.inc 
b/libclc/clc/lib/generic/math/clc_remquo_stret.inc
deleted file mode 100644
index eecc25525d8d7..0000000000000
--- a/libclc/clc/lib/generic/math/clc_remquo_stret.inc
+++ /dev/null
@@ -1,158 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifdef __CLC_SCALAR
-
-#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
-#define __CLC_REMQUO_EVAL_TYPE __CLC_GENTYPE
-#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_GENTYPE
-#define __CLC_S_EVAL_TYPE __CLC_S_GENTYPE
-#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_S_GENTYPE
-#elif __CLC_FPSIZE == 16
-#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
-#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
-#define __CLC_S_EVAL_TYPE __CLC_INTN
-#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
-#endif
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_REMQUO_RET_GENTYPE
-__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y) {
-  // How many bits of the quotient per iteration
-
-#if __CLC_FPSIZE == 32
-  const __CLC_INTN bits = 12;
-  const __CLC_GENTYPE max_exp = 0x1.0p+127f;
-#elif __CLC_FPSIZE == 64
-  const __CLC_INTN bits = 26;
-  const __CLC_GENTYPE max_exp = 0x1.0p+1023;
-#elif __CLC_FPSIZE == 16
-  const __CLC_INTN bits = 11;
-  const __CLC_GENTYPE max_exp = 0x1.0p+15h;
-#endif
-
-  // Track low 7 bits of the integral quotient.
-  __CLC_INTN q7;
-
-  __CLC_REMQUO_EVAL_TYPE ax = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(x));
-  __CLC_REMQUO_EVAL_TYPE ay = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(y));
-
-  __CLC_GENTYPE ret;
-
-  if (ax > ay) {
-    __CLC_INTN ex, ey;
-
-    __CLC_REMQUO_EVAL_TYPE mx = __clc_frexp(ax, &ex);
-    --ex;
-
-    __CLC_REMQUO_EVAL_TYPE my = __clc_frexp(ay, &ey);
-    --ey;
-
-    ax = __clc_ldexp(mx, bits);
-    ay = __clc_ldexp(my, 1);
-
-    __CLC_INTN nb = ex - ey;
-    __CLC_REMQUO_EVAL_TYPE ayinv = __clc_recip_fast(ay);
-
-    __CLC_INTN qacc = 0;
-
-    while (nb > bits) {
-      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
-
-#if __CLC_FPSIZE == 16
-      ax = __clc_mad(-q, ay, ax);
-#else
-      ax = __clc_fma(-q, ay, ax);
-#endif
-      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
-      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
-      ax = clt ? axp : ax;
-      ax = __clc_ldexp(ax, bits);
-
-      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
-      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
-      qacc = (qacc << bits) | iq;
-
-      nb -= bits;
-    }
-
-    ax = __clc_ldexp(ax, nb - bits + 1);
-
-    __CLC_INTN is_odd;
-
-    // Final iteration
-    {
-      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
-#if __CLC_FPSIZE == 16
-      ax = __clc_mad(-q, ay, ax);
-#else
-      ax = __clc_fma(-q, ay, ax);
-#endif
-
-      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
-      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
-      ax = clt ? axp : ax;
-      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
-      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
-
-      qacc = (qacc << (nb + 1)) | iq;
-      is_odd = (iq & 1) != 0;
-    }
-
-    // Adjust ax so that it is the range (-y/2, y/2]
-    // We need to choose the even integer when x/y is midway between two
-    // integers
-    __CLC_S_EVAL_TYPE aq = ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay) |
-                           (__CLC_CONVERT_S_EVAL_TYPE(is_odd) &
-                            ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax == ay));
-    ax = ax - (aq ? ay : (__CLC_REMQUO_EVAL_TYPE)0.0);
-
-    ax = __clc_ldexp(ax, ey);
-    qacc += aq ? 1 : 0;
-
-    __CLC_S_GENTYPE qneg = __clc_signbit(x) ^ __clc_signbit(y) ? -1 : 0;
-    q7 = ((qacc & 0x7f) ^ qneg) - qneg;
-
-    ret = __clc_signbit(x) ? -ax : ax;
-  } else {
-    __CLC_S_EVAL_TYPE c = (ax > (__CLC_REMQUO_EVAL_TYPE)0.5 * ay);
-    if (__CLC_FPSIZE != 16)
-      c |= (ay < max_exp && (__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay);
-
-    __CLC_CHARN qsgn = __CLC_CONVERT_CHARN(__clc_signbit(x) == 
__clc_signbit(y))
-                           ? (__CLC_CHARN)1
-                           : (__CLC_CHARN)-1;
-
-    __CLC_GENTYPE t = __clc_mad(y, -__CLC_CONVERT_GENTYPE(qsgn), x);
-    ret = c ? t : __clc_flush_if_daz(x);
-    q7 = c ? qsgn : 0;
-
-    __CLC_GENTYPE zero = __clc_copysign(__CLC_FP_LIT(0.0), x);
-    ret = ax == ay ? zero : ret;
-    q7 = ax == ay ? qsgn : q7;
-  }
-
-  ret = y == __CLC_FP_LIT(0.0) ? __CLC_GENTYPE_NAN : ret;
-  q7 = y == __CLC_FP_LIT(0.0) ? 0 : q7;
-
-  __CLC_S_GENTYPE finite = !__clc_isnan(y) && __clc_isfinite(x);
-
-  // A defined 0 result for quo with a nan result is an additional OpenCL
-  // requirement beyond standard C.
-  __CLC_REMQUO_RET_GENTYPE result;
-  result.quo = finite ? q7 : 0;
-  result.rem = finite ? ret : __CLC_GENTYPE_NAN;
-
-  return result;
-}
-
-#undef __CLC_REMQUO_EVAL_TYPE
-#undef __CLC_CONVERT_REMQUO_EVAL_TYPE
-#undef __CLC_S_EVAL_TYPE
-#undef __CLC_CONVERT_S_EVAL_TYPE
-
-#endif // __CLC_SCALAR

>From 64fdb5b8c664cd2b1a2ecc11224627f518297723 Mon Sep 17 00:00:00 2001
From: Karol Herbst <[email protected]>
Date: Wed, 1 Apr 2026 02:55:52 +0200
Subject: [PATCH 5/5] Partly revert "libclc: Update trigpi functions (#187579)"

The initial commit broke sinpi

This partly reverts commit 421bf13e4bf1c19a0cc48fbb0af63b902d3118b1
---
 .../clc/math/clc_sincos_helpers_fp64_decl.inc |   5 +
 libclc/clc/lib/generic/math/clc_sinpi.cl      |   8 +-
 libclc/clc/lib/generic/math/clc_sinpi.inc     | 106 +++++++++++++++++-
 3 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc 
b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
index 22081d94dee51..d638d72ab8494 100644
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
@@ -22,6 +22,11 @@ _CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN 
__clc_tan_reduced_eval(__CLC_DOUBLEN x,
                                                             __CLC_DOUBLEN y,
                                                             __CLC_INTN is_odd);
 
+_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
+                                                __CLC_DOUBLEN xx,
+                                                private __CLC_DOUBLEN *sinval,
+                                                private __CLC_DOUBLEN *cosval);
+
 _CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small(
     __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr);
 
diff --git a/libclc/clc/lib/generic/math/clc_sinpi.cl 
b/libclc/clc/lib/generic/math/clc_sinpi.cl
index bf57dc45ccd1e..f9bed7ec78306 100644
--- a/libclc/clc/lib/generic/math/clc_sinpi.cl
+++ b/libclc/clc/lib/generic/math/clc_sinpi.cl
@@ -6,8 +6,12 @@
 //
 
//===----------------------------------------------------------------------===//
 
-#include "clc/math/clc_sincospi.h"
-#include "clc/math/clc_sinpi.h"
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_sincos_helpers.h"
+#include "clc/math/math.h"
 
 #define __CLC_BODY "clc_sinpi.inc"
 #include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_sinpi.inc 
b/libclc/clc/lib/generic/math/clc_sinpi.inc
index a6e5ee73b1650..264609aeaca45 100644
--- a/libclc/clc/lib/generic/math/clc_sinpi.inc
+++ b/libclc/clc/lib/generic/math/clc_sinpi.inc
@@ -6,7 +6,109 @@
 //
 
//===----------------------------------------------------------------------===//
 
+#if __CLC_FPSIZE == 32
+
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
-  __CLC_GENTYPE unused_cos;
-  return __clc_sincospi(x, &unused_cos);
+  __CLC_INTN ix = __CLC_AS_INTN(x);
+  __CLC_INTN xsgn = ix & (__CLC_INTN)0x80000000;
+  ix ^= xsgn;
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  __CLC_INTN iax = __CLC_CONVERT_INTN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  __CLC_INTN xodd =
+      xsgn ^ ((iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0);
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_INTN ir = QNANBITPATT_SP32;
+
+  // 2^23 <= |x| < Inf, the result is always integer
+  ir = ix < PINFBITPATT_SP32 ? xsgn : ir;
+
+  // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0f - r;
+  __CLC_INTN e = 0;
+
+  // r <= 0.75
+  __CLC_INTN c = r <= 0.75f;
+  a = c ? r - 0.5f : a;
+  e = c ? 1 : e;
+
+  // r < 0.5
+  c = r < 0.5f;
+  a = c ? 0.5f - r : a;
+
+  // 0 < r <= 0.25
+  c = r <= 0.25f;
+  a = c ? r : a;
+  e = c ? 0 : e;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval);
+  __CLC_INTN jr = xodd ^ __CLC_AS_INTN(e != 0 ? cosval : sinval);
+
+  ir = ix < 0x4b000000 ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
 }
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
+  __CLC_LONGN ix = __CLC_AS_LONGN(x);
+  __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L;
+  ix ^= xsgn;
+  __CLC_GENTYPE absx = __clc_fabs(x);
+  __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx);
+  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
+  __CLC_LONGN xodd =
+      xsgn ^
+      ((iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L);
+
+  // Initialize with return for +-Inf and NaN
+  __CLC_LONGN ir = QNANBITPATT_DP64;
+
+  // 2^23 <= |x| < Inf, the result is always integer
+  ir = ix < PINFBITPATT_DP64 ? xsgn : ir;
+
+  // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+  // r < 1.0
+  __CLC_GENTYPE a = 1.0 - r;
+  __CLC_LONGN e = 0;
+
+  //  r <= 0.75
+  __CLC_LONGN c = r <= 0.75;
+  __CLC_GENTYPE t = r - 0.5;
+  a = c ? t : a;
+  e = c ? 1 : e;
+
+  // r < 0.5
+  c = r < 0.5;
+  t = 0.5 - r;
+  a = c ? t : a;
+
+  // r <= 0.25
+  c = r <= 0.25;
+  a = c ? r : a;
+  e = c ? 0 : e;
+
+  __CLC_GENTYPE api = a * M_PI;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(api, 0.0, &sinval, &cosval);
+  __CLC_LONGN jr = xodd ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval);
+
+  ir = absx < 0x1.0p+52 ? jr : ir;
+
+  return __CLC_AS_GENTYPE(ir);
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_sinpi(__CLC_CONVERT_FLOATN(x)));
+}
+
+#endif

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to