https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/149736
>From 104183777e3a6bd14f209c74da53e6f592b72d9f Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Fri, 18 Jul 2025 22:58:19 +0200 Subject: [PATCH 1/3] [LV] Vectorize maxnum/minnum w/o fast-math flags. (#148239) Update LV to vectorize maxnum/minnum reductions without fast-math flags, by adding an extra check in the loop if any inputs to maxnum/minnum are NaN, due to maxnum/minnum behavior w.r.t to signaling NaNs. Signed-zeros are already handled consistently by maxnum/minnum. If any input is NaN, *exit the vector loop, *compute the reduction result up to the vector iteration that contained NaN inputs and * resume in the scalar loop New recurrence kinds are added for reductions using maxnum/minnum without fast-math flags. PR: https://github.com/llvm/llvm-project/pull/148239 (cherry picked from commit 004c67ea257039e4e98abc26dd4ac6e8f3d7a171) --- llvm/include/llvm/Analysis/IVDescriptors.h | 3 + llvm/lib/Analysis/IVDescriptors.cpp | 26 +- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +- .../Vectorize/LoopVectorizationPlanner.h | 12 +- .../Transforms/Vectorize/LoopVectorize.cpp | 18 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 6 + .../Transforms/Vectorize/VPlanAnalysis.cpp | 1 + .../Vectorize/VPlanConstruction.cpp | 160 +++++++++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +- .../Transforms/Vectorize/VPlanTransforms.h | 6 + .../AArch64/fmax-without-fast-math-flags.ll | 55 +++- .../AArch64/fmin-without-fast-math-flags.ll | 55 +++- ...fmax-without-fast-math-flags-interleave.ll | 55 +++- .../fmax-without-fast-math-flags.ll | 272 +++++++++++++++++- .../fmin-without-fast-math-flags.ll | 94 +++++- .../LoopVectorize/minmax_reduction.ll | 8 +- 16 files changed, 731 insertions(+), 58 deletions(-) diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index b985292ccee40..1dc73205a0ebb 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -47,6 +47,8 @@ 
enum class RecurKind { FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). + FMinNum, ///< FP min with llvm.minnum semantics including NaNs. + FMaxNum, ///< FP max with llvm.maxnum semantics including NaNs. FMinimum, ///< FP min with llvm.minimum semantics FMaximum, ///< FP max with llvm.maximum semantics FMinimumNum, ///< FP min with llvm.minimumnum semantics @@ -250,6 +252,7 @@ class RecurrenceDescriptor { /// Returns true if the recurrence kind is a floating-point min/max kind. static bool isFPMinMaxRecurrenceKind(RecurKind Kind) { return Kind == RecurKind::FMin || Kind == RecurKind::FMax || + Kind == RecurKind::FMinNum || Kind == RecurKind::FMaxNum || Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 39f74beca082f..8be5de3bf356f 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) || match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value())); }; - if (isIntMinMaxRecurrenceKind(Kind) || - (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind))) + if (isIntMinMaxRecurrenceKind(Kind)) return isMinMaxPattern(I, Kind, Prev); - else if (isFMulAddIntrinsic(I)) + if (isFPMinMaxRecurrenceKind(Kind)) { + InstDesc Res = isMinMaxPattern(I, Kind, Prev); + if (!Res.isRecurrence()) + return InstDesc(false, I); + if (HasRequiredFMF()) + return Res; + // We may be able to vectorize FMax/FMin reductions using maxnum/minnum + // intrinsics with extra checks ensuring the vector loop handles only + // non-NaN inputs. 
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) { + assert(Kind == RecurKind::FMax && + "unexpected recurrence kind for maxnum"); + return InstDesc(I, RecurKind::FMaxNum); + } + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) { + assert(Kind == RecurKind::FMin && + "unexpected recurrence kind for minnum"); + return InstDesc(I, RecurKind::FMinNum); + } + return InstDesc(false, I); + } + if (isFMulAddIntrinsic(I)) return InstDesc(Kind == RecurKind::FMulAdd, I, I->hasAllowReassoc() ? nullptr : I); return InstDesc(false, I); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 200d1fb854155..e7623aaff105d 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { case RecurKind::UMin: return Intrinsic::vector_reduce_umin; case RecurKind::FMax: + case RecurKind::FMaxNum: return Intrinsic::vector_reduce_fmax; case RecurKind::FMin: + case RecurKind::FMinNum: return Intrinsic::vector_reduce_fmin; case RecurKind::FMaximum: return Intrinsic::vector_reduce_fmaximum; @@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) { case RecurKind::SMax: return Intrinsic::smax; case RecurKind::FMin: + case RecurKind::FMinNum: return Intrinsic::minnum; case RecurKind::FMax: + case RecurKind::FMaxNum: return Intrinsic::maxnum; case RecurKind::FMinimum: return Intrinsic::minimum; @@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right) { Type *Ty = Left->getType(); if (Ty->isIntOrIntVectorTy() || - (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum || + (RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum || + RK == RecurKind::FMinimum || RK == RecurKind::FMaximum || RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) { - // TODO: Add float minnum/maxnum support when 
FMF nnan is set. Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK); return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr, "rdx.minmax"); @@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, case RecurKind::UMin: case RecurKind::FMax: case RecurKind::FMin: + case RecurKind::FMinNum: + case RecurKind::FMaxNum: case RecurKind::FMinimum: case RecurKind::FMaximum: case RecurKind::FMinimumNum: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 11853859484e3..f57ce0c3ccb4d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -230,7 +230,6 @@ class VPBuilder { /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A /// and \p B. - /// TODO: add createFCmp when needed. VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { @@ -240,6 +239,17 @@ class VPBuilder { new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); } + /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A + /// and \p B. 
+ VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { + assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && + Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); + return tryInsertInstruction( + new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + } + VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 06db89a89bc38..74f59a2f7f136 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4345,10 +4345,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { - // Cross iteration phis such as reductions need special handling and are - // currently unsupported. - if (any_of(OrigLoop->getHeader()->phis(), - [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) + // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum + // reductions need special handling and are currently unsupported. + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { + if (!Legal->isReductionVariable(&Phi)) + return Legal->isFixedOrderRecurrence(&Phi); + RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); + return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum; + })) return false; // Phis with uses outside of the loop require special handling and are @@ -8817,6 +8821,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. 
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + // Apply mandatory transformation to handle FP maxnum/minnum reduction with + // NaNs if possible, bail out otherwise. + if (!VPlanTransforms::runPass( + VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath, *Plan)) + return nullptr; + // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 31aec77db63c1..f6610ea5b333f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -23196,6 +23196,8 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: @@ -23333,6 +23335,8 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: @@ -23435,6 +23439,8 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index b27a7ffeed208..66657b98b094b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return ResTy; } case 
Instruction::ICmp: + case Instruction::FCmp: case VPInstruction::ActiveLaneMask: assert(inferScalarType(R->getOperand(0)) == inferScalarType(R->getOperand(1)) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 52eecb000d0c2..c71d70935b449 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -628,3 +628,163 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, Term->addMetadata(LLVMContext::MD_prof, BranchWeights); } } + +bool VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan) { + auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { + auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( + RedPhiR->getBackedgeValue()->getDefiningRecipe()); + if (!MinMaxR) + return nullptr; + + auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR); + if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) && + !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr()))) + return nullptr; + +#ifndef NDEBUG + Intrinsic::ID RdxIntrinsicId = + RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? 
Intrinsic::maxnum + : Intrinsic::minnum; + assert((isa<VPWidenIntrinsicRecipe>(MinMaxR) && + cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() == + RdxIntrinsicId) || + (RepR && + cast<IntrinsicInst>(RepR->getUnderlyingInstr())->getIntrinsicID() == + RdxIntrinsicId) && + "Intrinsic did not match recurrence kind"); +#endif + + if (MinMaxR->getOperand(0) == RedPhiR) + return MinMaxR->getOperand(1); + + assert(MinMaxR->getOperand(1) == RedPhiR && + "Reduction phi operand expected"); + return MinMaxR->getOperand(0); + }; + + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPReductionPHIRecipe *RedPhiR = nullptr; + bool HasUnsupportedPhi = false; + for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { + if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R)) + continue; + auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R); + if (!Cur) { + // TODO: Also support fixed-order recurrence phis. + HasUnsupportedPhi = true; + continue; + } + // For now, only a single reduction is supported. + // TODO: Support multiple MaxNum/MinNum reductions and other reductions. + if (RedPhiR) + return false; + if (Cur->getRecurrenceKind() != RecurKind::FMaxNum && + Cur->getRecurrenceKind() != RecurKind::FMinNum) { + HasUnsupportedPhi = true; + continue; + } + RedPhiR = Cur; + } + + if (!RedPhiR) + return true; + + // We won't be able to resume execution in the scalar tail, if there are + // unsupported header phis or there is no scalar tail at all, due to + // tail-folding. + if (HasUnsupportedPhi || !Plan.hasScalarTail()) + return false; + + VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR); + if (!MinMaxOp) + return false; + + RecurKind RedPhiRK = RedPhiR->getRecurrenceKind(); + assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) && + "unsupported reduction"); + + /// Check if the vector loop of \p Plan can early exit and restart + /// execution of last vector iteration in the scalar loop. 
This requires all + /// recipes up to early exit point be side-effect free as they are + /// re-executed. Currently we check that the loop is free of any recipe that + /// may write to memory. Expected to operate on an early VPlan w/o nested + /// regions. + for (VPBlockBase *VPB : vp_depth_first_shallow( + Plan.getVectorLoopRegion()->getEntryBasicBlock())) { + auto *VPBB = cast<VPBasicBlock>(VPB); + for (auto &R : *VPBB) { + if (R.mayWriteToMemory() && + !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + return false; + } + } + + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator()); + assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + "Unexpected terminator"); + auto *IsLatchExitTaken = + Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0), + LatchExitingBranch->getOperand(1)); + + VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp); + VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN}); + auto *AnyExitTaken = + Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); + LatchExitingBranch->eraseFromParent(); + + // If we exit early due to NaNs, compute the final reduction result based on + // the reduction phi at the beginning of the last vector iteration. 
+ auto *RdxResult = find_singleton<VPSingleDefRecipe>( + RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * { + auto *VPI = dyn_cast<VPInstruction>(U); + if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult) + return VPI; + return nullptr; + }); + + auto *MiddleVPBB = Plan.getMiddleBlock(); + Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin()); + auto *NewSel = + Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1)); + RdxResult->setOperand(1, NewSel); + + auto *ScalarPH = Plan.getScalarPreheader(); + // Update resume phis for inductions in the scalar preheader. If AnyNaN is + // true, then resume from the start of the last vector iteration via the + // canonical IV, otherwise from the original value. + for (auto &R : ScalarPH->phis()) { + auto *ResumeR = cast<VPPhi>(&R); + VPValue *VecV = ResumeR->getOperand(0); + if (VecV == RdxResult) + continue; + if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) { + if (DerivedIV->getNumUsers() == 1 && + DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) { + auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), + &Plan.getVectorTripCount()); + DerivedIV->moveAfter(&*Builder.getInsertPoint()); + DerivedIV->setOperand(1, NewSel); + continue; + } + } + // Bail out and abandon the current, partially modified, VPlan if we + // encounter a resume phi that cannot be updated yet. 
+ if (VecV != &Plan.getVectorTripCount()) { + LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with " + "FMaxNum/FMinNum reduction.\n"); + return false; + } + auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV); + ResumeR->setOperand(0, NewSel); + } + + auto *MiddleTerm = MiddleVPBB->getTerminator(); + Builder.setInsertPoint(MiddleTerm); + VPValue *MiddleCond = MiddleTerm->getOperand(0); + VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN)); + MiddleTerm->setOperand(0, NewCond); + return true; +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1664bcc3881aa..57b713d3dfcb9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -587,6 +587,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); return Builder.CreateFreeze(Op, Name); } + case Instruction::FCmp: case Instruction::ICmp: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); @@ -860,7 +861,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Res = State.get(getOperand(0)); for (VPValue *Op : drop_begin(operands())) Res = Builder.CreateOr(Res, State.get(Op)); - return Builder.CreateOrReduce(Res); + return State.VF.isScalar() ? 
Res : Builder.CreateOrReduce(Res); } case VPInstruction::FirstActiveLane: { if (getNumOperands() == 1) { @@ -1033,6 +1034,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { switch (getOpcode()) { case Instruction::ExtractElement: case Instruction::Freeze: + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: @@ -1068,6 +1070,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return Op == getOperand(1); case Instruction::PHI: return true; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case Instruction::Or: @@ -1100,6 +1103,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { switch (getOpcode()) { default: return false; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: return vputils::onlyFirstPartUsed(this); @@ -1786,7 +1790,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { return Opcode == Instruction::ZExt; break; case OperationType::Cmp: - return Opcode == Instruction::ICmp; + return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; case OperationType::Other: return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 870b1bb68b79a..4d1752fe57565 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -99,6 +99,12 @@ struct VPlanTransforms { /// not valid. static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); + /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do, + /// try to update the vector loop to exit early if any input is NaN and resume + /// executing in the scalar loop to handle the NaNs there. Return false if + /// this attempt was unsuccessful. + static bool handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan); + /// Clear NSW/NUW flags from reduction instructions if necessary. 
static void clearReductionWrapFlags(VPlan &Plan); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index 451574a258c2b..427a05cc1c843 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] 
= add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: 
[[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll index e93ee5563b057..1a8e5940d88e7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll @@ -42,18 +42,59 @@ define float @fminnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fminnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 
[[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], 
i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index b2e080fef2e57..a2eddad179216 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], 
[ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] 
= call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ 
[[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll index 5661406b88a5a..1ca5586942d7c 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll @@ -192,18 +192,51 @@ define float @fmaxnum_1(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum_1( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float 
@llvm.maxnum.f32(float [[L]], float [[MAX]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: @@ -227,18 +260,234 @@ define float @fmaxnum_2(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum_2( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: 
[[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; 
CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %max.next = call float @llvm.maxnum.f32(float %max, float %l) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %max.next +} + +define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmaxnum_induction_starts_at_10( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -10 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = add i64 10, [[INDEX]] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> 
[[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = add i64 10, [[TMP9]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 10, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr 
[[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop ] + %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %max.next = call float @llvm.maxnum.f32(float %l, float %max) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %max.next +} + +define float @fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) { +; CHECK-LABEL: define float @fmaxnum_induction_starts_at_value( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[START:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[START]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = add i64 [[START]], [[INDEX]] +; CHECK-NEXT: 
[[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[START]], [[TMP9]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, 
%[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %start, %entry ], [ %iv.next, %loop ] + %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %max.next = call float @llvm.maxnum.f32(float %l, float %max) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %max.next +} + +define float @fmaxnum_with_additional_add(ptr noalias %src, ptr noalias %src.2, i64 %n) { +; CHECK-LABEL: define float @fmaxnum_with_additional_add( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: 
[[GEP_SRC_2:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC_2]], i64 [[IV]] +; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[L_SRC_2]] ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 -; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: store i32 [[SUM_NEXT_LCSSA]], ptr [[SRC_2]], align 4 ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: @@ -247,14 +496,19 @@ entry: loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ] + %gep.src.2 = getelementptr inbounds nuw i32, ptr %src.2, i64 %iv + %l.src.2 = load i32, ptr %gep.src.2, align 4 + %sum.next = add i32 %sum, %l.src.2 %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv %l = load float, ptr %gep.src, align 4 - %max.next = call float @llvm.maxnum.f32(float %max, float %l) + %max.next = call float @llvm.maxnum.f32(float %l, float %max) %iv.next = add nuw nsw i64 %iv, 1 %ec = icmp eq i64 %iv.next, %n br i1 %ec, label %exit, label %loop exit: + store i32 %sum.next, ptr %src.2 ret float %max.next } diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll index 148beb64a3609..68bc8d0640a3f 100644 --- 
a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll @@ -192,18 +192,51 @@ define float @fminnum_1(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fminnum_1( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: 
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[L]], float [[MAX]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], 
label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: @@ -227,18 +260,51 @@ define float @fminnum_2(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fminnum_2( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or 
i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; 
CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll index 85a90f2e04c5e..e7ab02cd98a5e 100644 --- a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll @@ -1001,8 +1001,10 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +; This can be vectorized with additional runtime checks for NaNs. ; CHECK-LABEL: @fmin_intrinsic_nofast( -; CHECK-NOT: <2 x float> @llvm.minnum.v2f32 +; CHECK: <2 x float> @llvm.minnum.v2f32 +; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]] define float @fmin_intrinsic_nofast(ptr nocapture readonly %x) { entry: br label %for.body @@ -1021,8 +1023,10 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +; This can be vectorized with additional runtime checks for NaNs. 
; CHECK-LABEL: @fmax_intrinsic_nofast( -; CHECK-NOT: <2 x float> @llvm.maxnum.v2f32 +; CHECK: <2 x float> @llvm.maxnum.v2f32 +; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]] define float @fmax_intrinsic_nofast(ptr nocapture readonly %x) { entry: br label %for.body >From 09ddbf2ca220c4564d2e9d156bb66b8dab94ba3d Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Mon, 21 Jul 2025 08:18:01 +0200 Subject: [PATCH 2/3] Update LoopVectorize.cpp --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 74f59a2f7f136..9ca588a2d4085 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8823,8 +8823,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Apply mandatory transformation to handle FP maxnum/minnum reduction with // NaNs if possible, bail out otherwise. 
- if (!VPlanTransforms::runPass( - VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath, *Plan)) + if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions, + *Plan)) return nullptr; // Transform recipes to abstract recipes if it is legal and beneficial and >From 1838c311ceb4254270dba23772ac23e98e420ceb Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Mon, 21 Jul 2025 08:18:30 +0200 Subject: [PATCH 3/3] Update VPlanConstruction.cpp --- llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index c71d70935b449..a7a22e042aefc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -629,7 +629,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, } } -bool VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan) { +bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( RedPhiR->getBackedgeValue()->getDefiningRecipe()); _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits