https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/172553
If the sign bit of the denominator is known 0, do not emit the fabs. Also, extend this to handle min/max with fabs inputs. I originally tried to do this as the general combine on fabs, but it proved to be too much trouble at this time. This is mostly complexity introduced by expanding the various min/maxes into canonicalizes, and then not being able to assume the sign bit of canonicalize (fabs x) without nnan. This defends against future code size regressions in the atan2 and atan2pi library functions. From ae61164d50b4b08e6b8527c52fc35124f6016b9b Mon Sep 17 00:00:00 2001 From: Matt Arsenault <[email protected]> Date: Tue, 16 Dec 2025 19:28:19 +0100 Subject: [PATCH] AMDGPU: Avoid introducing unnecessary fabs in fast fdiv lowering If the sign bit of the denominator is known 0, do not emit the fabs. Also, extend this to handle min/max with fabs inputs. I originally tried to do this as the general combine on fabs, but it proved to be too much trouble at this time. This is mostly complexity introduced by expanding the various min/maxes into canonicalizes, and then not being able to assume the sign bit of canonicalize (fabs x) without nnan. This defends against future code size regressions in the atan2 and atan2pi library functions. --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 15 +++++++++++++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 ++++- ...bs-known-signbit-combine-fast-fdiv-lowering.ll | 12 ++++++------ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 69491c6f2c565..4482df15242d9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2791,6 +2791,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth); } +// TODO: Should have argument to specify if sign bit of nan is ignorable. 
bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const { if (Depth >= MaxRecursionDepth) return false; // Limit search depth. @@ -2812,6 +2813,20 @@ bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const { case ISD::FEXP2: case ISD::FEXP10: return Op->getFlags().hasNoNaNs(); + case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMINIMUM: + case ISD::FMINIMUMNUM: + return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) && + SignBitIsZeroFP(Op.getOperand(0), Depth + 1); + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMAXIMUMNUM: + // TODO: If we can ignore the sign bit of nans, only one side being known 0 + // is sufficient. + return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) && + SignBitIsZeroFP(Op.getOperand(0), Depth + 1); default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ff50fdfe9b09f..afdeed658b76e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12336,7 +12336,10 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); + // TODO: The combiner should probably handle elimination of redundant fabs. + SDValue r1 = DAG.SignBitIsZeroFP(RHS) + ? 
RHS + : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); const APFloat K0Val(0x1p+96f); const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); diff --git a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll index 038252e4cb1e4..750f390e79110 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll @@ -73,7 +73,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs(float %x, float % ; CHECK-NEXT: v_max_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -97,7 +97,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs(float %x, float % ; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -122,7 +122,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maximum_fabs(float %x, float ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -147,7 +147,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minimum_fabs(float %x, float ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CHECK-NEXT: s_mov_b32 s4, 
0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -171,7 +171,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maximumnum_fabs(float %x, flo ; CHECK-NEXT: v_max_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -195,7 +195,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minimumnum_fabs(float %x, flo ; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
