https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/165011
>From 01e872d95c1708392ae429879f36f6a32ca4889a Mon Sep 17 00:00:00 2001 From: Guy David <[email protected]> Date: Fri, 24 Oct 2025 19:30:19 +0300 Subject: [PATCH] [DAGCombiner] Relax nsz constraint for FP optimizations Some floating-point optimizations don't trigger because they can produce incorrect results around signed zeros, and rely on the existence of the nsz flag which commonly appears when fast-math is enabled. However, this flag is not a hard requirement when all of the users of the combined value are either guaranteed to overwrite the sign-bit or simply ignore it (comparisons, etc.). The optimizations affected: - fadd x, +0.0 -> x - fsub x, -0.0 -> x - fsub +0.0, x -> fneg x - fdiv(x, sqrt(x)) -> sqrt(x) - frem lowering with power-of-2 divisors --- llvm/include/llvm/CodeGen/SelectionDAG.h | 6 ++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 +++-- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 40 +++++++++++ .../CodeGen/AArch64/ignore-signed-zero.ll | 72 +++++++++++++++++++ .../AMDGPU/fcanonicalize-elimination.ll | 2 +- llvm/test/CodeGen/AMDGPU/swdev380865.ll | 5 +- 6 files changed, 132 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/ignore-signed-zero.ll diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index b024e8a68bd6e..9dba2ee8692f5 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -2326,6 +2326,12 @@ class SelectionDAG { /// +nan are considered positive, -0.0, -inf and -nan are not. LLVM_ABI bool cannotBeOrderedNegativeFP(SDValue Op) const; + /// Check if a use of a float value is insensitive to signed zeros. + LLVM_ABI bool canIgnoreSignBitOfZero(const SDUse &Use) const; + + /// Check if a value has at most two uses, all insensitive to signed zeros. + LLVM_ABI bool canIgnoreSignBitOfZero(SDValue Op) const; + /// Test whether two SDValues are known to compare equal. 
This /// is true if they are the same value, or if one is negative zero and the /// other positive zero. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c9513611e6dcb..3624748a3b0f0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17869,7 +17869,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math) ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true); if (N1C && N1C->isZero()) - if (N1C->isNegative() || Flags.hasNoSignedZeros()) + if (N1C->isNegative() || Flags.hasNoSignedZeros() || + DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) @@ -18081,7 +18082,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) { - if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) { + if (!N1CFP->isNegative() || Flags.hasNoSignedZeros() || + DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) { return N0; } } @@ -18094,7 +18096,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // (fsub -0.0, N1) -> -N1 if (N0CFP && N0CFP->isZero()) { - if (N0CFP->isNegative() || Flags.hasNoSignedZeros()) { + if (N0CFP->isNegative() || Flags.hasNoSignedZeros() || + DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) { // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are // flushed to zero, unless all users treat denorms as zero (DAZ). 
// FIXME: This transform will change the sign of a NaN and the behavior @@ -18744,7 +18747,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } // Fold X/Sqrt(X) -> Sqrt(X) - if (Flags.hasNoSignedZeros() && Flags.hasAllowReassociation()) + if ((Flags.hasNoSignedZeros() || DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) && + Flags.hasAllowReassociation()) if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0)) return N1; @@ -18795,8 +18799,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { TLI.isOperationLegalOrCustom(ISD::FDIV, VT) && TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) && DAG.isKnownToBeAPowerOfTwoFP(N1)) { - bool NeedsCopySign = - !Flags.hasNoSignedZeros() && !DAG.cannotBeOrderedNegativeFP(N0); + bool NeedsCopySign = !Flags.hasNoSignedZeros() && + !DAG.cannotBeOrderedNegativeFP(N0) && + !DAG.canIgnoreSignBitOfZero(SDValue(N, 0)); SDValue Div = DAG.getNode(ISD::FDIV, DL, VT, N0, N1); SDValue Rnd = DAG.getNode(ISD::FTRUNC, DL, VT, Div); SDValue MLA; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index c2b4c19846316..64fd925684ffa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6118,6 +6118,46 @@ bool SelectionDAG::cannotBeOrderedNegativeFP(SDValue Op) const { llvm_unreachable("covered opcode switch"); } +bool SelectionDAG::canIgnoreSignBitOfZero(const SDUse &Use) const { + assert(Use.getValueType().isFloatingPoint()); + const SDNode *User = Use.getUser(); + unsigned OperandNo = Use.getOperandNo(); + // Check if this use is insensitive to the sign of zero + switch (User->getOpcode()) { + case ISD::SETCC: + // Comparisons: IEEE-754 specifies +0.0 == -0.0. + case ISD::FABS: + // fabs always produces +0.0. + return true; + case ISD::FCOPYSIGN: + // copysign overwrites the sign bit of the first operand. 
+ return OperandNo == 0; + case ISD::FADD: + case ISD::FSUB: { + // Arithmetic with non-zero constants fixes the uncertainty around the + // sign bit. + SDValue Other = User->getOperand(1 - OperandNo); + return isKnownNeverZeroFloat(Other); + } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + // fp-to-int conversions normalize signed zeros. + return true; + default: + return false; + } +} + +bool SelectionDAG::canIgnoreSignBitOfZero(SDValue Op) const { + // FIXME: Limit the amount of checked uses to not introduce a compile-time + // regression. Ideally, this should be implemented as a demanded-bits + // optimization that stems from the users. + if (Op->use_size() > 2) + return false; + return all_of(Op->uses(), + [&](const SDUse &Use) { return canIgnoreSignBitOfZero(Use); }); +} + bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { // Check the obvious case. if (A == B) return true; diff --git a/llvm/test/CodeGen/AArch64/ignore-signed-zero.ll b/llvm/test/CodeGen/AArch64/ignore-signed-zero.ll new file mode 100644 index 0000000000000..3b17e410ac380 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ignore-signed-zero.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s + +; Test that nsz constraint can be bypassed when all uses are sign-insensitive. 
+ +define i1 @test_fadd_neg_zero_fcmp(float %x) { +; CHECK-LABEL: test_fadd_neg_zero_fcmp: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %add = fadd float %x, -0.0 + %cmp = fcmp oeq float %add, 1.0 + ret i1 %cmp +} + +define float @test_fsub_zero_fabs(float %x) { +; CHECK-LABEL: test_fsub_zero_fabs: +; CHECK: // %bb.0: +; CHECK-NEXT: fabs s0, s0 +; CHECK-NEXT: ret + %sub = fsub float %x, 0.0 + %abs = call float @llvm.fabs.f32(float %sub) + ret float %abs +} + +define float @test_fsub_neg_zero_copysign(float %x, float %y) { +; CHECK-LABEL: test_fsub_neg_zero_copysign: +; CHECK: // %bb.0: +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret + %sub = fsub float -0.0, %x + %copysign = call float @llvm.copysign.f32(float %sub, float %y) + ret float %copysign +} + +define i1 @test_div_sqrt_fcmp(float %x) { +; CHECK-LABEL: test_div_sqrt_fcmp: +; CHECK: // %bb.0: +; CHECK-NEXT: fsqrt s0, s0 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: cset w0, gt +; CHECK-NEXT: ret + %sqrt = call float @llvm.sqrt.f32(float %x) + %div = fdiv reassoc float %x, %sqrt + %cmp = fcmp ogt float %div, 0.0 + ret i1 %cmp +} + +define float @test_frem_fabs(float %x) { +; CHECK-LABEL: test_frem_fabs: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, #0.50000000 +; CHECK-NEXT: fmov s2, #-2.00000000 +; CHECK-NEXT: fmul s1, s0, s1 +; CHECK-NEXT: frintz s1, s1 +; CHECK-NEXT: fmadd s0, s1, s2, s0 +; CHECK-NEXT: fabs s0, s0 +; CHECK-NEXT: ret + %rem = frem float %x, 2.0 + %abs = call float @llvm.fabs.f32(float %rem) + ret float %abs +} + +declare float @llvm.fabs.f32(float) +declare float @llvm.copysign.f32(float, float) +declare float @llvm.sqrt.f32(float) diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll 
b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 05d3e9c381910..1b8ff6b688c19 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -371,7 +371,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1 %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 - %v0 = fadd float %load, 0.0 + %v0 = fadd float %load, 1.0 %v = tail call float @llvm.fabs.f32(float %v0) %canonicalized = tail call float @llvm.canonicalize.f32(float %v) store float %canonicalized, ptr addrspace(1) %gep, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index d4a8a0d762afd..1130c465c15e3 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -28,14 +28,13 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0x40140000 -; CHECK-NEXT: s_add_i32 s1, s1, s0 -; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0x40180000 +; CHECK-NEXT: s_add_i32 s1, s1, s0 +; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0x401c0000 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
