================
@@ -6,10 +6,17 @@
//
//===----------------------------------------------------------------------===//
-// On AMDGPU the "fast" reciprocal is the hardware v_rcp_f32 approximation,
+// On AMDGPU the "fast" reciprocal is the hardware v_rcp approximation.
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_recip_fast(__CLC_GENTYPE x) {
#if defined(__CLC_SCALAR) && __CLC_FPSIZE == 32
return __builtin_amdgcn_rcpf(x);
+#elif defined(__CLC_SCALAR) && __CLC_FPSIZE == 64
+ // Hardware v_rcp_f64 seed refined with two Newton-Raphson iterations. This
+ // computes 1.0 / x without the full IEEE scaling and subnormal fixups.
+ __CLC_GENTYPE r = __builtin_amdgcn_rcp(x);
+ r = __builtin_fma(__builtin_fma(-x, r, 1.0), r, r);
+ r = __builtin_fma(__builtin_fma(-x, r, 1.0), r, r);
+ return r;
----------------
jhuber6 wrote:
That doesn't seem to happen in practice. I'm guessing it's a `-ffast-math`
thing, but we probably don't want to enable that for the math code itself.
https://github.com/llvm/llvm-project/pull/203809
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits