Author: Matt Arsenault
Date: 2026-03-23T10:06:59Z
New Revision: 1a9fe1769a0f9684dc9ecfcb7a2cec2d66077cc3

URL: 
https://github.com/llvm/llvm-project/commit/1a9fe1769a0f9684dc9ecfcb7a2cec2d66077cc3
DIFF: 
https://github.com/llvm/llvm-project/commit/1a9fe1769a0f9684dc9ecfcb7a2cec2d66077cc3.diff

LOG: libclc: Update remquo (#187998)

This was failing in the float case without -cl-denorms-are-zero
and failing for double. This now passes in all cases.

This was originally ported from the ROCm device libs in
8db45e4cf170cc6044a0afe7a0ed8876dcd9a863. This is mostly a port
of more recent changes from there, with a few modifications:

- Templatification, which almost, but not quite, enables
  vectorization; the outer branch and loop still prevent it.

- Merging of the 3 types into one shared code path, instead of
  duplicating it per type with 3 different functions implemented together.
  There are only some slight differences for the half case, which mostly
  evaluates as float.

- Splitting out of the is_odd tracking, instead of deriving it from the
  accumulated quotient. This costs an extra register, but saves several
  instructions. This also enables automatic elimination of all of the quo
  output handling when this code is reused for remainder. I'm guessing
  this would be unnecessary if SimplifyDemandedBits handled phis.

- Removal of the slow FMA path. I don't see how this would ever be
  faster with the number of instructions replacing it. This is really a
  problem for the compiler to solve anyway.

Added: 
    libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
    libclc/clc/lib/generic/math/clc_remquo_stret.inc

Modified: 
    libclc/clc/include/clc/math/remquo_decl.inc
    libclc/clc/lib/generic/math/clc_remquo.cl
    libclc/clc/lib/generic/math/clc_remquo.inc

Removed: 
    


################################################################################
diff  --git a/libclc/clc/include/clc/math/remquo_decl.inc 
b/libclc/clc/include/clc/math/remquo_decl.inc
index cba28a7244eb4..8ba601199ef0f 100644
--- a/libclc/clc/include/clc/math/remquo_decl.inc
+++ b/libclc/clc/include/clc/math/remquo_decl.inc
@@ -6,19 +6,29 @@
 //
 
//===----------------------------------------------------------------------===//
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     private __CLC_INTN *q);
+typedef struct __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE) {
+  __CLC_GENTYPE rem;
+  __CLC_INTN quo;
+} __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE);
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     global __CLC_INTN *q);
+#define __CLC_REMQUO_RET_GENTYPE __CLC_XCONCAT(__clc_remquo_ret_, 
__CLC_GENTYPE)
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     local __CLC_INTN *q);
+_CLC_OVERLOAD _CLC_DECL __CLC_REMQUO_RET_GENTYPE
+__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y);
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   private __CLC_INTN *q);
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   global __CLC_INTN *q);
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   local __CLC_INTN *q);
 #if _CLC_GENERIC_AS_SUPPORTED
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     generic __CLC_INTN *q);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   generic __CLC_INTN *q);
 #endif

diff  --git a/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc 
b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
new file mode 100644
index 0000000000000..2c233b36cc73c
--- /dev/null
+++ b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
@@ -0,0 +1,82 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
+#endif
+
+#ifndef __CLC_RET_SCALAR_TYPE
+#define __CLC_RET_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG1_SCALAR_TYPE
+#define __CLC_ARG1_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG2_SCALAR_TYPE
+#define __CLC_ARG2_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_OUT_ARG3_SCALAR_TYPE
+#define __CLC_OUT_ARG3_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#define __CLC_RET_TYPE __CLC_XCONCAT(__CLC_RET_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_ARG1_TYPE __CLC_XCONCAT(__CLC_ARG1_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_ARG2_TYPE __CLC_XCONCAT(__CLC_ARG2_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_OUT_ARG3_TYPE                                                    
\
+  __CLC_XCONCAT(__CLC_OUT_ARG3_SCALAR_TYPE, __CLC_VECSIZE)
+
+#ifndef __CLC_OUT_ARG3_ADDRESS_SPACE
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __private
+#endif
+
+#if __CLC_VECSIZE_OR_1 >= 2
+
+_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE
+__CLC_FUNCTION(__CLC_ARG1_TYPE x, __CLC_ARG2_TYPE y,
+               __CLC_OUT_ARG3_ADDRESS_SPACE __CLC_OUT_ARG3_TYPE *z) {
+  union {
+    __CLC_ARG1_TYPE vec;
+    __CLC_ARG1_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_x;
+
+  union {
+    __CLC_ARG2_TYPE vec;
+    __CLC_ARG2_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_y;
+
+  union {
+    __CLC_RET_TYPE vec;
+    __CLC_RET_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_result0;
+
+  union {
+    __CLC_OUT_ARG3_TYPE vec;
+    __CLC_OUT_ARG3_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_result1;
+
+  u_x.vec = x;
+  u_y.vec = y;
+  for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
+    u_result0.arr[i] =
+        __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i], &u_result1.arr[i]);
+  }
+
+  *z = u_result1.vec;
+  return u_result0.vec;
+}
+
+#endif // __CLC_VECSIZE_OR_1 >= 2
+
+#undef __CLC_RET_TYPE
+#undef __CLC_ARG1_TYPE
+#undef __CLC_ARG2_TYPE
+#undef __CLC_OUT_ARG3_TYPE

diff  --git a/libclc/clc/lib/generic/math/clc_remquo.cl 
b/libclc/clc/lib/generic/math/clc_remquo.cl
index e254093d591d4..502b9e5edc405 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.cl
+++ b/libclc/clc/lib/generic/math/clc_remquo.cl
@@ -6,32 +6,58 @@
 //
 
//===----------------------------------------------------------------------===//
 
+#include "clc/math/clc_remquo.h"
+
 #include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
 #include "clc/integer/clc_clz.h"
-#include "clc/internal/clc.h"
-#include "clc/math/clc_floor.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_fabs.h"
 #include "clc/math/clc_flush_if_daz.h"
 #include "clc/math/clc_fma.h"
+#include "clc/math/clc_frexp.h"
 #include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_rint.h"
 #include "clc/math/clc_subnormal_config.h"
 #include "clc/math/clc_trunc.h"
 #include "clc/math/math.h"
-#include "clc/shared/clc_max.h"
+#include "clc/relational/clc_isfinite.h"
+#include "clc/relational/clc_isnan.h"
+#include "clc/relational/clc_signbit.h"
+
+#define __CLC_FUNCTION __clc_remquo_stret
+#define __CLC_BODY "clc_remquo_stret.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FUNCTION __clc_remquo
+#define __CLC_BODY "clc_remquo.inc"
+#include "clc/math/gentype.inc"
 
-#define __CLC_ADDRESS_SPACE private
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __private
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
 
-#define __CLC_ADDRESS_SPACE global
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __local
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
 
-#define __CLC_ADDRESS_SPACE local
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __global
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
 
 #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define __CLC_ADDRESS_SPACE generic
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
 #endif

diff  --git a/libclc/clc/lib/generic/math/clc_remquo.inc 
b/libclc/clc/lib/generic/math/clc_remquo.inc
index cf8a5ebcea20c..649bdd9ee8b65 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -6,266 +6,19 @@
 //
 
//===----------------------------------------------------------------------===//
 
-_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
-                                          __CLC_ADDRESS_SPACE int *quo) {
-  x = __clc_flush_if_daz(x);
-  y = __clc_flush_if_daz(y);
-  int ux = __clc_as_int(x);
-  int ax = ux & EXSIGNBIT_SP32;
-  float xa = __clc_as_float(ax);
-  int sx = ux ^ ax;
-  int ex = ax >> EXPSHIFTBITS_SP32;
-
-  int uy = __clc_as_int(y);
-  int ay = uy & EXSIGNBIT_SP32;
-  float ya = __clc_as_float(ay);
-  int sy = uy ^ ay;
-  int ey = ay >> EXPSHIFTBITS_SP32;
-
-  float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
-  float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
-  int c;
-  int k = ex - ey;
-
-  uint q = 0;
-
-  while (k > 0) {
-    c = xr >= yr;
-    q = (q << 1) | c;
-    xr -= c ? yr : 0.0f;
-    xr += xr;
-    --k;
-  }
-
-  c = xr > yr;
-  q = (q << 1) | c;
-  xr -= c ? yr : 0.0f;
-
-  int lt = ex < ey;
-
-  q = lt ? 0 : q;
-  xr = lt ? xa : xr;
-  yr = lt ? ya : yr;
-
-  c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
-  xr -= c ? yr : 0.0f;
-  q += c;
-
-  float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
-  xr *= lt ? 1.0f : s;
-
-  int qsgn = sx == sy ? 1 : -1;
-  int quot = (q & 0x7f) * qsgn;
-
-  c = ax == ay;
-  quot = c ? qsgn : quot;
-  xr = c ? 0.0f : xr;
-
-  xr = __clc_as_float(sx ^ __clc_as_int(xr));
-
-  c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
-      ay == 0;
-  quot = c ? 0 : quot;
-  xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
-
-  *quo = quot;
-
-  return xr;
-}
-
-// remquo signature is special, we don't have macro for this
-#define __CLC_VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE)                        
\
-  _CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo(                          
\
-      TYPE##VEC_SIZE x, TYPE##VEC_SIZE y,                                      
\
-      __CLC_ADDRESS_SPACE int##VEC_SIZE *quo) {                                
\
-    int##HALF_VEC_SIZE lo, hi;                                                 
\
-    TYPE##VEC_SIZE ret;                                                        
\
-    ret.lo = __clc_remquo(x.lo, y.lo, &lo);                                    
\
-    ret.hi = __clc_remquo(x.hi, y.hi, &hi);                                    
\
-    (*quo).lo = lo;                                                            
\
-    (*quo).hi = hi;                                                            
\
-    return ret;                                                                
\
+#ifdef __CLC_SCALAR
+#define __CLC_REMQUO_DEF(addrspace)                                            
\
+  _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_remquo(                           
\
+      __CLC_GENTYPE x, __CLC_GENTYPE y, addrspace __CLC_INTN *quo_out) {       
\
+    __CLC_REMQUO_RET_GENTYPE result = __clc_remquo_stret(x, y);                
\
+    *quo_out = result.quo;                                                     
\
+    return result.rem;                                                         
\
   }
 
-#define __CLC_VEC3_REMQUO(TYPE)                                                
\
-  _CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo(                                 
\
-      TYPE##3 x, TYPE##3 y, __CLC_ADDRESS_SPACE int##3 * quo) {                
\
-    int2 lo;                                                                   
\
-    int hi;                                                                    
\
-    TYPE##3 ret;                                                               
\
-    ret.s01 = __clc_remquo(x.s01, y.s01, &lo);                                 
\
-    ret.s2 = __clc_remquo(x.s2, y.s2, &hi);                                    
\
-    (*quo).s01 = lo;                                                           
\
-    (*quo).s2 = hi;                                                            
\
-    return ret;                                                                
\
-  }
-__CLC_VEC_REMQUO(float, 2, )
-__CLC_VEC3_REMQUO(float)
-__CLC_VEC_REMQUO(float, 4, 2)
-__CLC_VEC_REMQUO(float, 8, 4)
-__CLC_VEC_REMQUO(float, 16, 8)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
-                                           __CLC_ADDRESS_SPACE int *pquo) {
-  ulong ux = __clc_as_ulong(x);
-  ulong ax = ux & ~SIGNBIT_DP64;
-  ulong xsgn = ux ^ ax;
-  double dx = __clc_as_double(ax);
-  int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
-  int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
-  xexp1 = xexp < 1 ? xexp1 : xexp;
-
-  ulong uy = __clc_as_ulong(y);
-  ulong ay = uy & ~SIGNBIT_DP64;
-  double dy = __clc_as_double(ay);
-  int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
-  int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
-  yexp1 = yexp < 1 ? yexp1 : yexp;
-
-  int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
-
-  // First assume |x| > |y|
-
-  // Set ntimes to the number of times we need to do a
-  // partial remainder. If the exponent of x is an exact multiple
-  // of 53 larger than the exponent of y, and the mantissa of x is
-  // less than the mantissa of y, ntimes will be one too large
-  // but it doesn't matter - it just means that we'll go round
-  // the loop below one extra time.
-  int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
-  double w = __clc_ldexp(dy, ntimes * 53);
-  w = ntimes == 0 ? dy : w;
-  double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
-
-  // Each time round the loop we compute a partial remainder.
-  // This is done by subtracting a large multiple of w
-  // from x each time, where w is a scaled up version of y.
-  // The subtraction must be performed exactly in quad
-  // precision, though the result at each stage can
-  // fit exactly in a double precision number.
-  int i;
-  double t, v, p, pp;
-
-  for (i = 0; i < ntimes; i++) {
-    // Compute integral multiplier
-    t = __clc_trunc(dx / w);
-
-    // Compute w * t in quad precision
-    p = w * t;
-    pp = __clc_fma(w, t, -p);
-
-    // Subtract w * t from dx
-    v = dx - p;
-    dx = v + (((dx - v) - p) - pp);
-
-    // If t was one too large, dx will be negative. Add back one w.
-    dx += dx < 0.0 ? w : 0.0;
-
-    // Scale w down by 2^(-53) for the next iteration
-    w *= scale;
-  }
-
-  // One more time
-  // Variable todd says whether the integer t is odd or not
-  t = __clc_floor(dx / w);
-  long lt = (long)t;
-  int todd = lt & 1;
-
-  p = w * t;
-  pp = __clc_fma(w, t, -p);
-  v = dx - p;
-  dx = v + (((dx - v) - p) - pp);
-  i = dx < 0.0;
-  todd ^= i;
-  dx += i ? w : 0.0;
-
-  lt -= i;
-
-  // At this point, dx lies in the range [0,dy)
-
-  // For the remainder function, we need to adjust dx
-  // so that it lies in the range (-y/2, y/2] by carefully
-  // subtracting w (== dy == y) if necessary. The rigmarole
-  // with todd is to get the correct sign of the result
-  // when x/y lies exactly half way between two integers,
-  // when we need to choose the even integer.
-
-  int al = (2.0 * dx > w) | (todd & (2.0 * dx == w));
-  double dxl = dx - (al ? w : 0.0);
-
-  int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w));
-  double dxg = dx - (ag ? w : 0.0);
-
-  dx = dy < 0x1.0p+1022 ? dxl : dxg;
-  lt += dy < 0x1.0p+1022 ? al : ag;
-  int quo = ((int)lt & 0x7f) * qsgn;
-
-  double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
-  dx = __clc_as_double(ax);
-
-  // Now handle |x| == |y|
-  int c = dx == dy;
-  t = __clc_as_double(xsgn);
-  quo = c ? qsgn : quo;
-  ret = c ? t : ret;
-
-  // Next, handle |x| < |y|
-  c = dx < dy;
-  quo = c ? 0 : quo;
-  ret = c ? x : ret;
-
-  c &= (yexp < 1023 & 2.0 * dx > dy) | (dx > 0.5 * dy);
-  quo = c ? qsgn : quo;
-  // we could use a conversion here instead since qsgn = +-1
-  p = qsgn == 1 ? -1.0 : 1.0;
-  t = __clc_fma(y, p, x);
-  ret = c ? t : ret;
-
-  // We don't need anything special for |x| == 0
-
-  // |y| is 0
-  c = dy == 0.0;
-  quo = c ? 0 : quo;
-  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
-
-  // y is +-Inf, NaN
-  c = yexp > BIASEDEMAX_DP64;
-  quo = c ? 0 : quo;
-  t = y == y ? x : y;
-  ret = c ? t : ret;
-
-  // x is +=Inf, NaN
-  c = xexp > BIASEDEMAX_DP64;
-  quo = c ? 0 : quo;
-  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
-
-  *pquo = quo;
-  return ret;
-}
-__CLC_VEC_REMQUO(double, 2, )
-__CLC_VEC3_REMQUO(double)
-__CLC_VEC_REMQUO(double, 4, 2)
-__CLC_VEC_REMQUO(double, 8, 4)
-__CLC_VEC_REMQUO(double, 16, 8)
-
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y,
-                                         __CLC_ADDRESS_SPACE int *pquo) {
-  return (half)__clc_remquo((float)x, (float)y, pquo);
-}
-__CLC_VEC_REMQUO(half, 2, )
-__CLC_VEC3_REMQUO(half)
-__CLC_VEC_REMQUO(half, 4, 2)
-__CLC_VEC_REMQUO(half, 8, 4)
-__CLC_VEC_REMQUO(half, 16, 8)
-
+__CLC_REMQUO_DEF(private)
+__CLC_REMQUO_DEF(local)
+__CLC_REMQUO_DEF(global)
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+__CLC_REMQUO_DEF(generic)
 #endif
+#endif // __CLC_SCALAR

diff  --git a/libclc/clc/lib/generic/math/clc_remquo_stret.inc 
b/libclc/clc/lib/generic/math/clc_remquo_stret.inc
new file mode 100644
index 0000000000000..eecc25525d8d7
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_remquo_stret.inc
@@ -0,0 +1,158 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
+#define __CLC_REMQUO_EVAL_TYPE __CLC_GENTYPE
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_GENTYPE
+#define __CLC_S_EVAL_TYPE __CLC_S_GENTYPE
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_S_GENTYPE
+#elif __CLC_FPSIZE == 16
+#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
+#define __CLC_S_EVAL_TYPE __CLC_INTN
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
+#endif
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_REMQUO_RET_GENTYPE
+__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  // How many bits of the quotient per iteration
+
+#if __CLC_FPSIZE == 32
+  const __CLC_INTN bits = 12;
+  const __CLC_GENTYPE max_exp = 0x1.0p+127f;
+#elif __CLC_FPSIZE == 64
+  const __CLC_INTN bits = 26;
+  const __CLC_GENTYPE max_exp = 0x1.0p+1023;
+#elif __CLC_FPSIZE == 16
+  const __CLC_INTN bits = 11;
+  const __CLC_GENTYPE max_exp = 0x1.0p+15h;
+#endif
+
+  // Track low 7 bits of the integral quotient.
+  __CLC_INTN q7;
+
+  __CLC_REMQUO_EVAL_TYPE ax = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(x));
+  __CLC_REMQUO_EVAL_TYPE ay = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(y));
+
+  __CLC_GENTYPE ret;
+
+  if (ax > ay) {
+    __CLC_INTN ex, ey;
+
+    __CLC_REMQUO_EVAL_TYPE mx = __clc_frexp(ax, &ex);
+    --ex;
+
+    __CLC_REMQUO_EVAL_TYPE my = __clc_frexp(ay, &ey);
+    --ey;
+
+    ax = __clc_ldexp(mx, bits);
+    ay = __clc_ldexp(my, 1);
+
+    __CLC_INTN nb = ex - ey;
+    __CLC_REMQUO_EVAL_TYPE ayinv = __clc_recip_fast(ay);
+
+    __CLC_INTN qacc = 0;
+
+    while (nb > bits) {
+      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
+
+#if __CLC_FPSIZE == 16
+      ax = __clc_mad(-q, ay, ax);
+#else
+      ax = __clc_fma(-q, ay, ax);
+#endif
+      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
+      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
+      ax = clt ? axp : ax;
+      ax = __clc_ldexp(ax, bits);
+
+      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
+      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
+      qacc = (qacc << bits) | iq;
+
+      nb -= bits;
+    }
+
+    ax = __clc_ldexp(ax, nb - bits + 1);
+
+    __CLC_INTN is_odd;
+
+    // Final iteration
+    {
+      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
+#if __CLC_FPSIZE == 16
+      ax = __clc_mad(-q, ay, ax);
+#else
+      ax = __clc_fma(-q, ay, ax);
+#endif
+
+      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
+      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
+      ax = clt ? axp : ax;
+      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
+      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
+
+      qacc = (qacc << (nb + 1)) | iq;
+      is_odd = (iq & 1) != 0;
+    }
+
+    // Adjust ax so that it is in the range (-y/2, y/2]
+    // We need to choose the even integer when x/y is midway between two
+    // integers
+    __CLC_S_EVAL_TYPE aq = ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay) |
+                           (__CLC_CONVERT_S_EVAL_TYPE(is_odd) &
+                            ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax == ay));
+    ax = ax - (aq ? ay : (__CLC_REMQUO_EVAL_TYPE)0.0);
+
+    ax = __clc_ldexp(ax, ey);
+    qacc += aq ? 1 : 0;
+
+    __CLC_S_GENTYPE qneg = __clc_signbit(x) ^ __clc_signbit(y) ? -1 : 0;
+    q7 = ((qacc & 0x7f) ^ qneg) - qneg;
+
+    ret = __clc_signbit(x) ? -ax : ax;
+  } else {
+    __CLC_S_EVAL_TYPE c = (ax > (__CLC_REMQUO_EVAL_TYPE)0.5 * ay);
+    if (__CLC_FPSIZE != 16)
+      c |= (ay < max_exp && (__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay);
+
+    __CLC_CHARN qsgn = __CLC_CONVERT_CHARN(__clc_signbit(x) == 
__clc_signbit(y))
+                           ? (__CLC_CHARN)1
+                           : (__CLC_CHARN)-1;
+
+    __CLC_GENTYPE t = __clc_mad(y, -__CLC_CONVERT_GENTYPE(qsgn), x);
+    ret = c ? t : __clc_flush_if_daz(x);
+    q7 = c ? qsgn : 0;
+
+    __CLC_GENTYPE zero = __clc_copysign(__CLC_FP_LIT(0.0), x);
+    ret = ax == ay ? zero : ret;
+    q7 = ax == ay ? qsgn : q7;
+  }
+
+  ret = y == __CLC_FP_LIT(0.0) ? __CLC_GENTYPE_NAN : ret;
+  q7 = y == __CLC_FP_LIT(0.0) ? 0 : q7;
+
+  __CLC_S_GENTYPE finite = !__clc_isnan(y) && __clc_isfinite(x);
+
+  // A defined 0 result for quo with a nan result is an additional OpenCL
+  // requirement beyond standard C.
+  __CLC_REMQUO_RET_GENTYPE result;
+  result.quo = finite ? q7 : 0;
+  result.rem = finite ? ret : __CLC_GENTYPE_NAN;
+
+  return result;
+}
+
+#undef __CLC_REMQUO_EVAL_TYPE
+#undef __CLC_CONVERT_REMQUO_EVAL_TYPE
+#undef __CLC_S_EVAL_TYPE
+#undef __CLC_CONVERT_S_EVAL_TYPE
+
+#endif // __CLC_SCALAR


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to