[llvm-branch-commits] [llvm] AMDGPU: Avoid introducing unnecessary fabs in fast fdiv lowering (PR #172553)

Matt Arsenault via llvm-branch-commits Tue, 16 Dec 2025 12:58:05 -0800

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/172553


If the sign bit of the denominator is known to be 0, do not emit the fabs.
Also, extend SignBitIsZeroFP to handle min/max with fabs inputs.

I originally tried to do this as a general combine on fabs, but
that proved to be too much trouble for now. Most of the complexity
comes from the various min/max operations being expanded into
canonicalizes, after which the sign bit of canonicalize (fabs x)
cannot be assumed to be zero without nnan.

This defends against future code size regressions in the atan2 and
atan2pi library functions.

From ae61164d50b4b08e6b8527c52fc35124f6016b9b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Tue, 16 Dec 2025 19:28:19 +0100
Subject: [PATCH] AMDGPU: Avoid introducing unnecessary fabs in fast fdiv
 lowering

If the sign bit of the denominator is known to be 0, do not emit the fabs.
Also, extend SignBitIsZeroFP to handle min/max with fabs inputs.

I originally tried to do this as a general combine on fabs, but
that proved to be too much trouble for now. Most of the complexity
comes from the various min/max operations being expanded into
canonicalizes, after which the sign bit of canonicalize (fabs x)
cannot be assumed to be zero without nnan.

This defends against future code size regressions in the atan2 and
atan2pi library functions.
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp    | 15 +++++++++++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp         |  5 ++++-
 ...bs-known-signbit-combine-fast-fdiv-lowering.ll | 12 ++++++------
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 69491c6f2c565..4482df15242d9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2791,6 +2791,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
   return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
 }
 
+// TODO: Should have argument to specify if sign bit of nan is ignorable.
 bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
   if (Depth >= MaxRecursionDepth)
     return false; // Limit search depth.
@@ -2812,6 +2813,20 @@ bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
   case ISD::FEXP2:
   case ISD::FEXP10:
     return Op->getFlags().hasNoNaNs();
+  case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMINIMUM:
+  case ISD::FMINIMUMNUM:
+    return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) &&
+           SignBitIsZeroFP(Op.getOperand(0), Depth + 1);
+  case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMAXIMUM:
+  case ISD::FMAXIMUMNUM:
+    // TODO: If we can ignore the sign bit of nans, only one side being known 0
+    // is sufficient.
+    return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) &&
+           SignBitIsZeroFP(Op.getOperand(0), Depth + 1);
   default:
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ff50fdfe9b09f..afdeed658b76e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12336,7 +12336,10 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(1);
   SDValue RHS = Op.getOperand(2);
 
-  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
+  // TODO: The combiner should probably handle elimination of redundant fabs.
+  SDValue r1 = DAG.SignBitIsZeroFP(RHS)
+                   ? RHS
+                   : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
 
   const APFloat K0Val(0x1p+96f);
   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
diff --git a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
index 038252e4cb1e4..750f390e79110 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
@@ -73,7 +73,7 @@ define float 
@fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs(float %x, float %
 ; CHECK-NEXT:    v_max_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
@@ -97,7 +97,7 @@ define float 
@fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs(float %x, float %
 ; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
@@ -122,7 +122,7 @@ define float 
@fdiv_fast_daz_rhs_signbit_known_zero_maximum_fabs(float %x, float
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
@@ -147,7 +147,7 @@ define float 
@fdiv_fast_daz_rhs_signbit_known_zero_minimum_fabs(float %x, float
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
@@ -171,7 +171,7 @@ define float 
@fdiv_fast_daz_rhs_signbit_known_zero_maximumnum_fabs(float %x, flo
 ; CHECK-NEXT:    v_max_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
@@ -195,7 +195,7 @@ define float 
@fdiv_fast_daz_rhs_signbit_known_zero_minimumnum_fabs(float %x, flo
 ; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [llvm] AMDGPU: Avoid introducing unnecessary fabs in fast fdiv lowering (PR #172553)

Reply via email to