llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-vectorizers Author: Sander de Smalen (sdesmalen-arm) <details> <summary>Changes</summary> We want the LV cost-model to make the best possible decision about which VF to use and whether or not to use partial reductions. At the moment, when the LV can use partial reductions for a given VF range, it assumes those are always preferred. After transforming the plan to use partial reductions, it then chooses the most profitable VF. However, a different VF might have been more profitable if partial reductions had not been used. This PR changes that by first deciding whether partial reductions are more profitable for a given chain; if they are not, the transform is not applied. This causes some regressions for AArch64, which are addressed in a follow-up PR to keep this one simple. --- Patch is 71.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/181706.diff 6 Files Affected: - (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+92-80) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-add-sdot-i16-i32.ll (+4) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll (+40-44) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll (+33-33) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll (+18-18) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll (+32-45) ``````````diff diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 12606ab9f6cd4..1e4bbe74ebbe9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4285,23 +4285,19 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); - if 
(Red->isPartialReduction()) { - TargetTransformInfo::PartialReductionExtendKind ExtKind = - TargetTransformInfo::getPartialReductionExtendKind(ExtOpc); - // FIXME: Move partial reduction creation, costing and clamping - // here from LoopVectorize.cpp. - ExtRedCost = Ctx.TTI.getPartialReductionCost( - Opcode, SrcTy, nullptr, RedTy, VF, ExtKind, - llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind, - RedTy->isFloatingPointTy() - ? std::optional{Red->getFastMathFlags()} - : std::nullopt); - } else if (!RedTy->isFloatingPointTy()) { - // TTI::getExtendedReductionCost only supports integer types. - ExtRedCost = Ctx.TTI.getExtendedReductionCost( - Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy, - Red->getFastMathFlags(), CostKind); - } + // For partial reductions, the decision has already been + // made at the point of transforming reductions -> partial + // reductions for a given plan, based on the cost-model. + if (Red->isPartialReduction()) + return true; + + // TTI::getExtendedReductionCost for in-loop reductions + // only supports integer types. + if (RedTy->isFloatingPointTy()) + return false; + ExtRedCost = Ctx.TTI.getExtendedReductionCost( + Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy, + Red->getFastMathFlags(), CostKind); return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; }, Range); @@ -4351,37 +4347,24 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; InstructionCost MulAccCost; - if (Red->isPartialReduction()) { - Type *SrcTy2 = - Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr; - // FIXME: Move partial reduction creation, costing and clamping - // here from LoopVectorize.cpp. - MulAccCost = Ctx.TTI.getPartialReductionCost( - Opcode, SrcTy, SrcTy2, RedTy, VF, - Ext0 ? TargetTransformInfo::getPartialReductionExtendKind( - Ext0->getOpcode()) - : TargetTransformInfo::PR_None, - Ext1 ? 
TargetTransformInfo::getPartialReductionExtendKind( - Ext1->getOpcode()) - : TargetTransformInfo::PR_None, - Mul->getOpcode(), CostKind, - RedTy->isFloatingPointTy() - ? std::optional{Red->getFastMathFlags()} - : std::nullopt); - } else { - // Only partial reductions support mixed or floating-point extends - // at the moment. - if (Ext0 && Ext1 && - (Ext0->getOpcode() != Ext1->getOpcode() || - Ext0->getOpcode() == Instruction::CastOps::FPExt)) - return false; + // For partial reductions, the decision has already been + // made at the point of transforming reductions -> partial + // reductions for a given plan, based on the cost-model. + if (Red->isPartialReduction()) + return true; + + // Only partial reductions support mixed or floating-point extends + // at the moment. + if (Ext0 && Ext1 && + (Ext0->getOpcode() != Ext1->getOpcode() || + Ext0->getOpcode() == Instruction::CastOps::FPExt)) + return false; - bool IsZExt = - !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt; - auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); - MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy, - SrcVecTy, CostKind); - } + bool IsZExt = + !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt; + auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); + MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy, + SrcVecTy, CostKind); InstructionCost MulCost = Mul->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); @@ -5876,12 +5859,11 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain, [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; }); } -/// Check if a partial reduction chain is is supported by the target (i.e. does -/// not have an invalid cost) for the given VF range. Clamps the range and -/// returns true if profitable for any VF. 
-static bool isValidPartialReduction(const VPPartialReductionChain &Chain, - Type *PhiType, VPCostContext &CostCtx, - VFRange &Range) { +/// Returns the cost of a link in a partial-reduction chain for a given VF. +static InstructionCost +getPartialReductionLinkCost(VPCostContext &CostCtx, + const VPPartialReductionChain &Chain, + ElementCount VF) { auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext) -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> { if (!Ext) @@ -5891,44 +5873,39 @@ static bool isValidPartialReduction(const VPPartialReductionChain &Chain, static_cast<Instruction::CastOps>(Ext->getOpcode())); return {ExtOpType, ExtKind}; }; - auto ExtInfoA = GetExtInfo(Chain.ExtendA); - auto ExtInfoB = GetExtInfo(Chain.ExtendB); - Type *ExtOpTypeA = ExtInfoA.first; - Type *ExtOpTypeB = ExtInfoB.first; - auto ExtKindA = ExtInfoA.second; - auto ExtKindB = ExtInfoB.second; + auto [ExtOpTypeA, ExtKindA] = GetExtInfo(Chain.ExtendA); + auto [ExtOpTypeB, ExtKindB] = GetExtInfo(Chain.ExtendB); + + std::optional<unsigned> BinOpc = + (Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) + ? std::optional{Chain.BinOp->getOpcode()} + : std::nullopt; // If ExtendB is nullptr but there's a separate BinOp, the second operand // was a constant that can use the same extend kind as the first. - if (!Chain.ExtendB && Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) { + if (!Chain.ExtendB && BinOpc) { const APInt *Const = nullptr; for (VPValue *Op : Chain.BinOp->operands()) { if (match(Op, m_APInt(Const))) break; } if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA)) - return false; + return InstructionCost::getInvalid(); ExtOpTypeB = ExtOpTypeA; ExtKindB = ExtKindA; } - std::optional<unsigned> BinOpc = - (Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) - ? 
std::make_optional(Chain.BinOp->getOpcode()) + Type *RdxType = CostCtx.Types.inferScalarType(Chain.ReductionBinOp); + std::optional<llvm::FastMathFlags> Flags = + RdxType->isFloatingPointTy() + ? std::optional{Chain.ReductionBinOp->getFastMathFlags()} : std::nullopt; - VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp; - return LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - return CostCtx.TTI - .getPartialReductionCost( - WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF, - ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind, - PhiType->isFloatingPointTy() - ? std::optional{WidenRecipe->getFastMathFlags()} - : std::nullopt) - .isValid(); - }, - Range); + unsigned Opcode = Chain.RK == RecurKind::Sub + ? Instruction::Add + : Chain.ReductionBinOp->getOpcode(); + return CostCtx.TTI.getPartialReductionCost(Opcode, ExtOpTypeA, ExtOpTypeB, + RdxType, VF, ExtKindA, ExtKindB, + BinOpc, CostCtx.CostKind, Flags); } /// Examines reduction operations to see if the target can use a cheaper @@ -5985,8 +5962,7 @@ getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPValue *PrevValue, assert(Operands.size() <= 2 && "expected at most 2 operands"); for (const auto &[I, OpVal] : enumerate(Operands)) { - // Allow constant as second operand - validation happens in - // isValidPartialReduction. + // Allow constant as second operand - validation happens later. const APInt *Unused; if (I > 0 && CastRecipes[0] && match(OpVal, m_APInt(Unused))) continue; @@ -6048,7 +6024,15 @@ getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPValue *PrevValue, VPPartialReductionChain Chain( {UpdateR, CastRecipes[0], CastRecipes[1], BinOp, static_cast<unsigned>(PHISize.getKnownScalarFactor(ASize)), RK}); - if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range)) + + /// Check if a partial reduction chain is supported by the target (i.e. + /// does not have an invalid cost) for the given VF range. Clamps the range + /// and returns true if feasible for any VF. 
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + return getPartialReductionLinkCost(CostCtx, Chain, VF).isValid(); + }, + Range)) return false; Chains.push_back(Chain); @@ -6104,6 +6088,26 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan, }); }; + auto IsProfitablePartialReductionChainForVF = + [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool { + InstructionCost PartialCost = 0, RegularCost = 0; + + // The chain is a profitable partial reduction chain if + // the cost of handling the entire chain is cheaper when + // using partial reductions than when handling the entire + // chain using regular reductions. + for (const VPPartialReductionChain &Link : Chain) { + PartialCost += getPartialReductionLinkCost(CostCtx, Link, VF); + RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx); + RegularCost += Link.BinOp && Link.BinOp != Link.ReductionBinOp + ? Link.BinOp->computeCost(VF, CostCtx) + : 0; + RegularCost += Link.ExtendA ? Link.ExtendA->computeCost(VF, CostCtx) : 0; + RegularCost += Link.ExtendB ? Link.ExtendB->computeCost(VF, CostCtx) : 0; + } + return PartialCost.isValid() && PartialCost <= RegularCost; + }; + // Validate chains: check that extends are only used by partial reductions, // and that reduction bin ops are only used by other partial reductions with // matching scale factors, are outside the loop region or the select @@ -6143,6 +6147,14 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan, } } } + + // Clear the chain if it is not profitable. 
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + return IsProfitablePartialReductionChainForVF(Chains, VF); + }, + Range)) + Chains.clear(); } for (auto &[Phi, Chains] : ChainsByPhi) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-add-sdot-i16-i32.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-add-sdot-i16-i32.ll index 02afd113d3efa..294846c7290a0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-add-sdot-i16-i32.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-add-sdot-i16-i32.ll @@ -16,6 +16,10 @@ ; RUN: -mattr=+sve,+sme2 -scalable-vectorization=on \ ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SCALABLE +; FIXME: This test currently fails because the cost-model deems the cost of a partial reduction +; of i16 -> i32 too high, such that the LV doesn't consider it profitable to use partial reductions. +; XFAIL: * + ; LV: Checking a loop in 'sext_reduction_i16_to_i32' ; CHECK-FIXED-BASE: Cost of 3 for VF 8: EXPRESSION vp<%8> = ir<%acc> + partial.reduce.add (ir<%load> sext to i32) ; CHECK-FIXED: Cost of 1 for VF 8: EXPRESSION vp<%8> = ir<%acc> + partial.reduce.add (ir<%load> sext to i32) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index d1fde2cdaafe1..32d04694c693a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -107,7 +107,7 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] @@ -116,17 +116,16 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP10]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]] -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP17]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP16]] = sub <vscale x 8 x i32> [[TMP12]], [[TMP17]] ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE-MAXBW: scalar.ph: @@ -193,7 +192,7 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NEON: middle.block: ; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] @@ -234,7 +233,7 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP11]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-SVE-NEXT: br i1 
[[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-SVE: middle.block: ; CHECK-SVE-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] @@ -275,7 +274,7 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] @@ -345,7 +344,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-NEON: middle.block: ; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; 
CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] @@ -387,7 +386,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP12]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VE... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/181706 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
