https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/133090
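Summary for anyone skimming the series below: PATCH 1 teaches calculateRegisterUsage that the value defined by a scaled reduction phi or a VPPartialReductionRecipe has VF/ScaleFactor lanes rather than VF lanes, so its register cost is now estimated at the scaled-down VF. The later patches are a rename (getScaleFactor -> getVFScaleFactor), debug-output tidy-ups, and test updates; the effect shows up in the test diffs, where dotp_unrolled is now interleaved two-way (32 elements per iteration instead of 16) because the estimate no longer overstates vector-register pressure. A rough standalone sketch of the arithmetic follows; this is not the LLVM code, and the fixed-width ElementCount stand-in and 128-bit register width are illustrative assumptions for a NEON-like target:

  #include <cassert>
  #include <cstdio>

  // Toy stand-in for llvm::ElementCount, fixed-width lanes only.
  struct ElementCount {
    unsigned Lanes;
    ElementCount divideCoefficientBy(unsigned N) const {
      assert(Lanes % N == 0 && "scale factor must divide the lane count");
      return {Lanes / N};
    }
  };

  // Vector registers needed to hold one value with ScalarBits-wide lanes,
  // assuming 128-bit registers.
  static unsigned getRegUsage(ElementCount VF, unsigned ScalarBits) {
    return (VF.Lanes * ScalarBits + 127) / 128;
  }

  int main() {
    ElementCount VF{16}; // vectorizing at VF = 16
    // A widened i32 value occupies four 128-bit registers...
    std::printf("widened: %u regs\n", getRegUsage(VF, 32));
    // ...but a partial reduction with scale factor 4 only defines a
    // <4 x i32> accumulator, i.e. a single register.
    std::printf("scaled:  %u regs\n", getRegUsage(VF.divideCoefficientBy(4), 32));
    return 0;
  }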
>From d0a9e1c7e89abc5890d7303a2e22a9a56e2f022b Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.te...@arm.com>
Date: Wed, 26 Mar 2025 14:01:59 +0000
Subject: [PATCH 1/6] [LV] Reduce register usage for scaled reductions

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  24 ++-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |   3 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  14 +-
 .../partial-reduce-dot-product-neon.ll        | 116 ++++++++----
 .../AArch64/partial-reduce-dot-product.ll     | 173 ++++++------------
 .../LoopVectorize/AArch64/reg-usage.ll        |   6 +-
 6 files changed, 171 insertions(+), 165 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2ebc7017f426a..486405991c612 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5036,10 +5036,23 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
         // even in the scalar case.
         RegUsage[ClassID] += 1;
       } else {
+        // The output from scaled phis and scaled reductions actually has
+        // fewer lanes than the VF.
+        auto VF = VFs[J];
+        if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
+          VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
+        else if (auto *PartialReductionR =
+                     dyn_cast<VPPartialReductionRecipe>(R))
+          VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
+        if (VF != VFs[J])
+          LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
+                            << " to " << VF << " for ";
+                     R->dump(););
+
         for (VPValue *DefV : R->definedValues()) {
           Type *ScalarTy = TypeInfo.inferScalarType(DefV);
           unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
-          RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
+          RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
         }
       }
     }
@@ -9137,8 +9150,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
-  if (getScalingForReduction(Instr))
-    return tryToCreatePartialReduction(Instr, Operands);
+  if (auto ScaleFactor = getScalingForReduction(Instr))
+    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -9162,7 +9175,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
 
 VPRecipeBase *
 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
-                                             ArrayRef<VPValue *> Operands) {
+                                             ArrayRef<VPValue *> Operands,
+                                             unsigned ScaleFactor) {
   assert(Operands.size() == 2 &&
          "Unexpected number of operands for partial reduction");
 
@@ -9195,7 +9209,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
     BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
   }
   return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
-                                      Reduction);
+                                      ScaleFactor, Reduction);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 334cfbad8bd7c..fd0064a34c4c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -178,7 +178,8 @@ class VPRecipeBuilder {
   /// Create and return a partial reduction recipe for a reduction instruction
   /// along with binary operation and reduction phi operands.
   VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
-                                            ArrayRef<VPValue *> Operands);
+                                            ArrayRef<VPValue *> Operands,
+                                            unsigned ScaleFactor);
 
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 37e0a176ab1cc..376526e804b4b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2033,6 +2033,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
+  unsigned getVFScaleFactor() const { return VFScaleFactor; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2063,17 +2065,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// scalar value.
 class VPPartialReductionRecipe : public VPSingleDefRecipe {
   unsigned Opcode;
+  unsigned ScaleFactor;
 
 public:
   VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
-                           VPValue *Op1)
+                           VPValue *Op1, unsigned ScaleFactor)
       : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
-                                 ReductionInst) {}
+                                 ScaleFactor, ReductionInst) {}
   VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+                           unsigned ScaleFactor,
                            Instruction *ReductionInst = nullptr)
       : VPSingleDefRecipe(VPDef::VPPartialReductionSC,
                           ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
-        Opcode(Opcode) {
+        Opcode(Opcode), ScaleFactor(ScaleFactor) {
     [[maybe_unused]] auto *AccumulatorRecipe =
         getOperand(1)->getDefiningRecipe();
     assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2084,7 +2088,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
 
   VPPartialReductionRecipe *clone() override {
     return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
-                                        getUnderlyingInstr());
+                                        ScaleFactor, getUnderlyingInstr());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2099,6 +2103,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
   /// Get the binary op's opcode.
   unsigned getOpcode() const { return Opcode; }
 
+  unsigned getScaleFactor() const { return ScaleFactor; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 83226a2074315..a77447500d598 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -770,10 +770,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -782,6 +782,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -794,45 +798,81 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; 
CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> 
[[WIDE_LOAD10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP25]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP31]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = 
call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP41]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP45]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index bcdbb4d4dfbf7..69133c78811a5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -3240,10 +3240,10 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 ; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX67]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], i32 0 @@ -3256,120 +3256,65 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI11:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI14:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI15:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD16]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nsw i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nsw i64 [[TMP8]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP12]], align 1 ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 
x i8> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31> -; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC27:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC30:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC25]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add <4 x i32> [[TMP20]], [[VEC_PHI14]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <4 x i32> [[TMP21]], [[VEC_PHI15]] -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <4 x i8> [[STRIDED_VEC17]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC26]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP24]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <4 x i32> [[TMP26]], [[VEC_PHI12]] -; CHECK-INTERLEAVED-NEXT: [[TMP29]] = add <4 x i32> [[TMP27]], [[VEC_PHI13]] -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <4 x i8> [[STRIDED_VEC18]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC27]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP34]] = add <4 x i32> [[TMP32]], [[VEC_PHI10]] -; CHECK-INTERLEAVED-NEXT: [[TMP35]] = add <4 x i32> [[TMP33]], [[VEC_PHI11]] -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <4 x i8> [[STRIDED_VEC19]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: 
[[TMP37:%.*]] = sext <4 x i8> [[STRIDED_VEC28]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = mul nsw <4 x i32> [[TMP36]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP40]] = add <4 x i32> [[TMP38]], [[VEC_PHI8]] -; CHECK-INTERLEAVED-NEXT: [[TMP41]] = add <4 x i32> [[TMP39]], [[VEC_PHI9]] -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = sext <4 x i8> [[STRIDED_VEC20]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <4 x i8> [[STRIDED_VEC29]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP46]] = add <4 x i32> [[TMP44]], [[VEC_PHI6]] -; CHECK-INTERLEAVED-NEXT: [[TMP47]] = add <4 x i32> [[TMP45]], [[VEC_PHI7]] -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <4 x i8> [[STRIDED_VEC21]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <4 x i8> [[STRIDED_VEC30]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <4 x i32> [[TMP48]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <4 x i32> [[TMP49]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP52]] = add <4 x i32> [[TMP50]], [[VEC_PHI4]] -; CHECK-INTERLEAVED-NEXT: [[TMP53]] = add <4 x i32> [[TMP51]], [[VEC_PHI5]] -; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <4 x i8> [[STRIDED_VEC22]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <4 x i8> [[STRIDED_VEC31]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = mul nsw <4 x i32> [[TMP54]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP58]] = add <4 x i32> [[TMP56]], [[VEC_PHI2]] -; CHECK-INTERLEAVED-NEXT: [[TMP59]] = add <4 x i32> [[TMP57]], [[VEC_PHI3]] -; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <4 x i8> [[STRIDED_VEC23]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <4 x i8> [[STRIDED_VEC32]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <4 x i32> [[TMP62]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <4 x i32> [[TMP63]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30> +; CHECK-INTERLEAVED-NEXT: 
[[STRIDED_VEC14:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <4 x i32> [[TMP14]], [[VEC_PHI7]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[STRIDED_VEC8]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP18]] = add <4 x i32> [[TMP17]], [[VEC_PHI6]] +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC9]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[VEC_PHI5]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[STRIDED_VEC10]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP22]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <4 x i32> [[TMP23]], [[VEC_PHI4]] +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC11]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <4 x i32> [[TMP26]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <4 x i8> [[STRIDED_VEC12]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP30]] = add <4 x i32> [[TMP29]], [[VEC_PHI2]] +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC13]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <4 x i32> [[TMP32]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <4 x i8> [[STRIDED_VEC14]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP36]] = add <4 x i32> [[TMP35]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP65]], [[TMP64]] -; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX33:%.*]] = add <4 x i32> [[TMP59]], [[TMP58]] -; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX33]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX34:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]] -; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX34]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX35:%.*]] = add <4 x i32> [[TMP47]], [[TMP46]] -; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX35]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX36:%.*]] = add <4 x i32> [[TMP41]], [[TMP40]] -; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX36]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP35]], [[TMP34]] -; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX37]]) -; 
CHECK-INTERLEAVED-NEXT:    [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP29]], [[TMP28]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX39:%.*]] = add <4 x i32> [[TMP23]], [[TMP22]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX39]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index 111ff26a021ab..c1f4b922c5d97 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -45,9 +45,9 @@ define void @load_and_compare_only_used_by_assume(ptr %a, ptr noalias %b) {
 ; CHECK-LABEL: LV: Checking a loop in 'load_and_compare_only_used_by_assume'
 ; CHECK: LV(REG): VF = vscale x 4
 ; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
 entry:
   br label %loop

>From a75c476e266b66c00bd25acdcae060b658c7359e Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.te...@arm.com>
Date: Mon, 31 Mar 2025 15:23:46 +0100
Subject: [PATCH 2/6] Rename to getVFScaleFactor

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |  2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h           | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 486405991c612..1091e9a2bb7bc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5043,7 +5043,7 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
           VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
         else if (auto *PartialReductionR =
                      dyn_cast<VPPartialReductionRecipe>(R))
-          VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
+          VF = VF.divideCoefficientBy(PartialReductionR->getVFScaleFactor());
         if (VF != VFs[J])
           LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
                             << " to " << VF << " for ";
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 376526e804b4b..5b7bf8eb17594 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2065,19 +2065,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// scalar value.
 class VPPartialReductionRecipe : public VPSingleDefRecipe {
   unsigned Opcode;
-  unsigned ScaleFactor;
+  unsigned VFScaleFactor;
 
 public:
   VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
-                           VPValue *Op1, unsigned ScaleFactor)
+                           VPValue *Op1, unsigned VFScaleFactor)
       : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
-                                 ScaleFactor, ReductionInst) {}
+                                 VFScaleFactor, ReductionInst) {}
   VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
-                           unsigned ScaleFactor,
+                           unsigned VFScaleFactor,
                            Instruction *ReductionInst = nullptr)
       : VPSingleDefRecipe(VPDef::VPPartialReductionSC,
                           ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
-        Opcode(Opcode), ScaleFactor(ScaleFactor) {
+        Opcode(Opcode), VFScaleFactor(VFScaleFactor) {
     [[maybe_unused]] auto *AccumulatorRecipe =
         getOperand(1)->getDefiningRecipe();
     assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2088,7 +2088,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
 
   VPPartialReductionRecipe *clone() override {
     return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
-                                        ScaleFactor, getUnderlyingInstr());
+                                        VFScaleFactor, getUnderlyingInstr());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2103,7 +2103,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
   /// Get the binary op's opcode.
   unsigned getOpcode() const { return Opcode; }
 
-  unsigned getScaleFactor() const { return ScaleFactor; }
+  unsigned getVFScaleFactor() const { return VFScaleFactor; }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
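Context for PATCH 3, which follows: moving the whole conditional inside LLVM_DEBUG, rather than guarding an LLVM_DEBUG call with an if, means the VF comparison itself is only evaluated in builds with assertions, and the entire block compiles away otherwise. A self-contained imitation of the idiom is below; the DebugFlag and isCurrentDebugType stand-ins are simplifications of the real machinery in llvm/Support/Debug.h, which is driven by -debug / -debug-only=<type>:

  #include <cstdio>

  // Stand-ins for ::llvm::DebugFlag and ::llvm::isCurrentDebugType.
  static bool DebugFlag = true;
  static bool isCurrentDebugType(const char *) { return true; }
  #define DEBUG_TYPE "loop-vectorize"

  #ifndef NDEBUG
  #define LLVM_DEBUG(X)                                  \
    do {                                                 \
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { \
        X;                                               \
      }                                                  \
    } while (false)
  #else
  // In a -Asserts (release) build the body is never compiled in at all.
  #define LLVM_DEBUG(X) do { } while (false)
  #endif

  int main() {
    unsigned VF = 16, ScaledVF = 4;
    // The patch-3 shape: the condition lives inside the macro argument.
    LLVM_DEBUG(if (VF != ScaledVF) {
      std::fprintf(stderr, "LV(REG): Scaled down VF from %u to %u\n", VF,
                   ScaledVF);
    });
    return 0;
  }

This is also why PATCH 6 has to add REQUIRES: asserts: the CHECK-REGS lines introduced in PATCH 5 rely on -debug-only output that only exists in an asserts build.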
>From 13623ddf9bde72df39586f94ed71b6b5607ee04a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.te...@arm.com>
Date: Mon, 31 Mar 2025 15:06:54 +0100
Subject: [PATCH 3/6] Put if statement inside DEBUG

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1091e9a2bb7bc..0948f23f5731a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5044,10 +5044,11 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
         else if (auto *PartialReductionR =
                      dyn_cast<VPPartialReductionRecipe>(R))
           VF = VF.divideCoefficientBy(PartialReductionR->getVFScaleFactor());
-        if (VF != VFs[J])
-          LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
-                            << " to " << VF << " for ";
-                     R->dump(););
+        LLVM_DEBUG(if (VF != VFs[J]) {
+          dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to "
+                 << VF << " for ";
+          R->dump();
+        });
 
         for (VPValue *DefV : R->definedValues()) {
           Type *ScalarTy = TypeInfo.inferScalarType(DefV);

>From 6b53b421d60d0b81a017d164742bd64e14ddcf14 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.te...@arm.com>
Date: Wed, 2 Apr 2025 16:54:27 +0100
Subject: [PATCH 4/6] auto -> ElementCount

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0948f23f5731a..095c7b205c7cb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5038,12 +5038,13 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
       } else {
         // The output from scaled phis and scaled reductions actually has
         // fewer lanes than the VF.
-        auto VF = VFs[J];
+        ElementCount VF = VFs[J];
         if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
           VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
         else if (auto *PartialReductionR =
                      dyn_cast<VPPartialReductionRecipe>(R))
           VF = VF.divideCoefficientBy(PartialReductionR->getVFScaleFactor());
+
         LLVM_DEBUG(if (VF != VFs[J]) {
           dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to "
                  << VF << " for ";

>From a1d86c2a3ce1b15e22abfef0a06275f1c9d217e6 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.te...@arm.com>
Date: Wed, 2 Apr 2025 16:54:43 +0100
Subject: [PATCH 5/6] Add register pressure debug output checks

---
 .../AArch64/partial-reduce-dot-product-neon.ll          | 7 +++++++
 .../LoopVectorize/AArch64/partial-reduce-dot-product.ll | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index a77447500d598..8772ef5ffc0d1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
@@ -947,6 +948,12 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
 ;
+; CHECK-REGS: LV: Checking a loop in 'dotp_unrolled' from <stdin>
+; CHECK-REGS: LV(REG): VF = 16
+; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers
+; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers
+; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 1 item
 entry:
   br label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 69133c78811a5..532f877c2f280 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
@@ -3420,6 +3421,12 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
 ; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
 ;
+; CHECK-REGS-LABEL: LV: Checking a loop in 'dotp_high_register_pressure' from <stdin>
+; CHECK-REGS: LV(REG): VF = 16
+; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
+; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers
+; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 1 item
 entry:
   %cmp100 = icmp sgt i32 %n, 0
   br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup

>From c97196d7ecc0794f6633195dde4ddc6d44de03cb Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.te...@arm.com>
Date: Thu, 3 Apr 2025 17:35:00 +0100
Subject: [PATCH 6/6] Add REQUIRES: asserts

---
 .../LoopVectorize/AArch64/partial-reduce-dot-product.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 532f877c2f280..b40c691c8f061 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1,4 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
+; REQUIRES: asserts
+
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
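A closing note on the scale factor of 4 that appears throughout the CHECK lines: llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 folds a <16 x i32> input into a <4 x i32> accumulator, so only four i32 lanes stay live across the loop, one 128-bit register per accumulator instead of four. The intrinsic leaves the exact input-to-accumulator lane mapping unspecified (only the total sum is guaranteed), so the modulo mapping in this scalar model is just one valid choice, written out for illustration:

  #include <array>
  #include <cstdio>
  #include <numeric>

  // Scalar model of @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32.
  static std::array<int, 4> partialReduceAdd(std::array<int, 4> Acc,
                                             const std::array<int, 16> &In) {
    for (unsigned I = 0; I < In.size(); ++I)
      Acc[I % Acc.size()] += In[I]; // one of several legal lane assignments
    return Acc;
  }

  int main() {
    std::array<int, 16> In;
    std::iota(In.begin(), In.end(), 1); // 1..16, total 136
    std::array<int, 4> Acc = partialReduceAdd({0, 0, 0, 0}, In);
    // Only four lanes are live across iterations, hence the single
    // <4 x i32> phi per accumulator and the smaller register estimate.
    std::printf("sum = %d\n", Acc[0] + Acc[1] + Acc[2] + Acc[3]); // 136
    return 0;
  }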