https://github.com/guy-david created https://github.com/llvm/llvm-project/pull/167313
FADDV is matched into FADDPv4f32 + FADDPv2i32p, but this can be relaxed when one or more elements (usually the 4th) are known to be zero.

Before:
```
movi d1, #0000000000000000
mov v0.s[3], v1.s[0]
faddp v0.4s, v0.4s, v0.4s
faddp s0, v0.2s
```

After:
```
mov s1, v0.s[2]
faddp s0, v0.2s
fadd s0, s0, s1
```

When all of the elements are known to be zero, the intrinsic now folds to a constant instead of emitting two additions.

>From 7a1406efe148f888a784851bd4268d227041d588 Mon Sep 17 00:00:00 2001
From: Guy David <[email protected]>
Date: Sun, 9 Nov 2025 17:55:53 +0200
Subject: [PATCH] [AArch64] Decompose faddv with known zero elements

FADDV is matched into FADDPv4f32 + FADDPv2i32p, but this can be relaxed
when one or more elements (usually the 4th) are known to be zero.

Before:
  movi d1, #0000000000000000
  mov v0.s[3], v1.s[0]
  faddp v0.4s, v0.4s, v0.4s
  faddp s0, v0.2s

After:
  mov s1, v0.s[2]
  faddp s0, v0.2s
  fadd s0, s0, s1
---
 .../Target/AArch64/AArch64ISelLowering.cpp | 58 +++++++++++++
 llvm/test/CodeGen/AArch64/faddv.ll         | 82 +++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/faddv.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 132afc27135e9..b4bf97e27bca4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22790,6 +22790,62 @@ static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
   }
 }
 
+/// Optimize patterns where we insert zeros into vector lanes before faddv.
+static SDValue tryCombineFADDVWithZero(SDNode *N, SelectionDAG &DAG) {
+  assert(getIntrinsicID(N) == Intrinsic::aarch64_neon_faddv &&
+         "Expected NEON faddv intrinsic");
+  SDLoc DL(N);
+  SDValue Vec = N->getOperand(1);
+  EVT VT = Vec.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  APInt DemandedElts = APInt::getAllOnes(NumElts);
+  APInt KnownZeroElts = DAG.computeVectorKnownZeroElements(Vec, DemandedElts);
+  unsigned NumZeroElts = KnownZeroElts.popcount();
+  // No element is known to be +0.0; fall back to the TableGen pattern.
+  if (NumZeroElts == 0)
+    return SDValue();
+  // All elements are +0.0, just return zero.
+  if (NumZeroElts == NumElts)
+    return DAG.getConstantFP(0.0, DL, EltVT);
+
+  // At least one element is +0.0, so it is worth decomposing the reduction
+  // into fadd's. FADDV is a pairwise reduction, so we need to respect the
+  // order of the elements in the vector.
+
+  // Check if we can output a signed zero.
+  // This avoids the scenario where all the added values are -0.0 except the
+  // +0.0 element we chose to ignore.
+  SDNodeFlags Flags = N->getFlags();
+  bool IsSignedZeroSafe = Flags.hasNoSignedZeros() ||
+                          DAG.allUsesSignedZeroInsensitive(SDValue(N, 0));
+  if (!IsSignedZeroSafe)
+    return SDValue();
+
+  // Extract all elements.
+  SmallVector<SDValue, 4> Elts;
+  for (unsigned I = 0; I < NumElts; I++) {
+    Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
+                               DAG.getConstant(I, DL, MVT::i64)));
+  }
+  // Perform pairwise reduction.
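+  // For example, for a <4 x float> input where only lane 3 is known to be
+  // +0.0, the first round below produces [e0+e1, e2] and the second round
+  // adds those two values, i.e. (e0 + e1) + e2, which corresponds to the
+  // "faddp s0, v0.2s" plus "fadd" sequence shown in the commit message.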
+  while (Elts.size() > 1) {
+    SmallVector<SDValue, 2> NewElts;
+    for (unsigned I = 0; I < Elts.size(); I += 2) {
+      if (!KnownZeroElts[I] && !KnownZeroElts[I + 1]) {
+        NewElts.push_back(
+            DAG.getNode(ISD::FADD, DL, EltVT, Elts[I], Elts[I + 1]));
+      } else if (KnownZeroElts[I]) {
+        NewElts.push_back(Elts[I + 1]);
+      } else if (KnownZeroElts[I + 1]) {
+        NewElts.push_back(Elts[I]);
+      }
+    }
+    Elts = std::move(NewElts);
+  }
+  return Elts[0];
+}
+
 static SDValue performIntrinsicCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const AArch64Subtarget *Subtarget) {
@@ -22813,6 +22869,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
   case Intrinsic::aarch64_neon_umaxv:
     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
+  case Intrinsic::aarch64_neon_faddv:
+    return tryCombineFADDVWithZero(N, DAG);
   case Intrinsic::aarch64_neon_fmax:
     return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
diff --git a/llvm/test/CodeGen/AArch64/faddv.ll b/llvm/test/CodeGen/AArch64/faddv.ll
new file mode 100644
index 0000000000000..e4a3781150cf7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/faddv.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+; Test element at index 0 is zero.
+define float @test_v2f32_element_0_zero(<2 x float> %vec) {
+; CHECK-LABEL: test_v2f32_element_0_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
+  %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %with_zero)
+  ret float %sum
+}
+
+; Test element at index 3 is zero.
+define float @test_v4f32_element_3_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_element_3_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov s1, v0.s[2]
+; CHECK-NEXT:    faddp s0, v0.2s
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fabs s0, s0
+; CHECK-NEXT:    ret
+entry:
+  %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
+  %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_zero)
+  %abs = call float @llvm.fabs.f32(float %sum)
+  ret float %abs
+}
+
+; Test elements at index 0 and 2 are zero.
+define float @test_v4f32_elements_0_2_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_elements_0_2_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov s0, v0.s[3]
+; CHECK-NEXT:    fabs s0, s0
+; CHECK-NEXT:    ret
+entry:
+  %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
+  %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
+  %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero2)
+  %abs = call float @llvm.fabs.f32(float %sum)
+  ret float %abs
+}
+
+; Test all elements are zero.
+define float @test_v4f32_all_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_all_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ret
+entry:
+  %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
+  %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
+  %zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
+  %zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
+  %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero4)
+  ret float %sum
+}
+
+; Test element at index 0 is zero.
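+; With lane 0 known to be +0.0, the reduction folds to extracting lane 1.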
+define double @test_v2f64_element_0_zero(<2 x double> %vec) {
+; CHECK-LABEL: test_v2f64_element_0_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov d0, v0.d[1]
+; CHECK-NEXT:    fabs d0, d0
+; CHECK-NEXT:    ret
+entry:
+  %with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
+  %sum = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %with_zero)
+  %abs = call double @llvm.fabs.f64(double %sum)
+  ret double %abs
+}
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+
+declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double>)

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
