================
@@ -694,29 +693,11 @@ Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
return Builder.CreateFMA(Y0E, EFMA, Y0);
}
-bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
- FastMathFlags DivFMF,
+bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
FastMathFlags SqrtFMF) const
{
- // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
- if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
- return false;
-
- Type *EltTy = SqrtOp->getType()->getScalarType();
- switch (EltTy->getTypeID()) {
- case Type::FloatTyID:
- // v_rsq_f32 gives 1ulp
- // Separate correctly rounded fdiv + sqrt give ~1.81 ulp.
-
- // FIXME: rsq formation should not depend on approx func or the fpmath
- // accuracy. This strictly improves precision.
- return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
- case Type::DoubleTyID:
- return true;
- default:
- return false;
- }
-
- llvm_unreachable("covered switch");
+ // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
+ // f64.
+ return DivFMF.allowContract() && SqrtFMF.allowContract();
----------------
dtcxzyw wrote:
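What about f16 (rsq.f16)? The removed switch returned false for every type other than float and double, but the new check no longer looks at the type at all. IIRC the f16 path doesn't exist, right?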
https://github.com/llvm/llvm-project/pull/172082
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits