https://github.com/skachkov-sc updated https://github.com/llvm/llvm-project/pull/140723
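For readers skimming the patch series below, here is a minimal sketch (in C++, illustrative only, not part of the PR) of the compress-store idiom that the new CM_Compressed widening decision and VPMonotonicPHIRecipe are meant to handle. The pre-commit tests added in patch 2/3 encode exactly this loop shape in IR; the function name and parameters here are made up for illustration.

// Hedged sketch, assuming the shape shown in the pre-commit tests:
// elements of src that satisfy the predicate are packed contiguously into
// dst, and dst advances only on the taken branch.
void compress_store(int *dst, const int *src, int c, int n) {
  for (int i = 0; i < n; ++i) {
    int v = src[i];
    if (v < c)
      *dst++ = v; // lowered to @llvm.masked.compressstore plus a scalar pointer bump
  }
}

Per vector iteration, the generated code (see the updated CHECK lines for test_store_with_pointer near the end of the diff) stores the selected lanes with llvm.masked.compressstore under the comparison mask and then advances dst by the number of active lanes times the element size (zext of the mask, vector.reduce.add, mul by 4, getelementptr).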
>From 9697cd806947ab6ebd021cb7919acd62cc2e29a0 Mon Sep 17 00:00:00 2001 From: Sergey Kachkov <[email protected]> Date: Fri, 7 Nov 2025 18:09:56 +0300 Subject: [PATCH 1/3] [VPlan] Implement compressed widening of memory instructions --- .../llvm/Analysis/TargetTransformInfo.h | 1 + .../Transforms/Vectorize/LoopVectorize.cpp | 24 ++++++++++---- llvm/lib/Transforms/Vectorize/VPlan.h | 32 ++++++++++++------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 +++++++++---- .../Transforms/Vectorize/VPlanTransforms.cpp | 11 ++++--- 5 files changed, 61 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0f17312b03827..e8769f5860c77 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1442,6 +1442,7 @@ class TargetTransformInfo { Normal, ///< The cast is used with a normal load/store. Masked, ///< The cast is used with a masked load/store. GatherScatter, ///< The cast is used with a gather/scatter. + Compressed, ///< The cast is used with an expand load/compress store. Interleave, ///< The cast is used with an interleaved load/store. Reversed, ///< The cast is used with a reversed load/store. }; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 914018591d832..25e8a63eae9cd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1027,6 +1027,7 @@ class LoopVectorizationCostModel { CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, + CM_Compressed, CM_Scalarize, CM_VectorCall, CM_IntrinsicCall @@ -3108,9 +3109,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (IsUniformMemOpUse(I)) return true; - return (WideningDecision == CM_Widen || - WideningDecision == CM_Widen_Reverse || - WideningDecision == CM_Interleave); + return ( + WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || + WideningDecision == CM_Interleave || WideningDecision == CM_Compressed); }; // Returns true if Ptr is the pointer operand of a memory access instruction @@ -5191,12 +5192,17 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost( Instruction *I, ElementCount VF, InstWidening Decision) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (Decision == CM_Compressed) + return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy, + /*VariableMask*/ true, Alignment, + CostKind, I); + assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) && "Expected widen decision."); - const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, @@ -6299,6 +6305,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, switch (getWideningDecision(I, VF)) { case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; + case LoopVectorizationCostModel::CM_Compressed: + return TTI::CastContextHint::Compressed; case LoopVectorizationCostModel::CM_Interleave: return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: @@ -7514,8 +7522,9 @@ 
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, Range.Start); bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; + bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed; bool Consecutive = - Reverse || Decision == LoopVectorizationCostModel::CM_Widen; + Reverse || Compressed || Decision == LoopVectorizationCostModel::CM_Widen; VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; if (Consecutive) { @@ -7545,11 +7554,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, } if (LoadInst *Load = dyn_cast<LoadInst>(I)) return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - VPIRMetadata(*Load, LVer), I->getDebugLoc()); + Compressed, VPIRMetadata(*Load, LVer), + I->getDebugLoc()); StoreInst *Store = cast<StoreInst>(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, - Reverse, VPIRMetadata(*Store, LVer), + Reverse, Compressed, VPIRMetadata(*Store, LVer), I->getDebugLoc()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bbb03fbdff7a2..26256951a9c6c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3193,6 +3193,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase, /// Whether the consecutive accessed addresses are in reverse order. bool Reverse; + /// Whether the consecutive accessed addresses are compressed with mask value. + bool Compressed; + /// Whether the memory access is masked. bool IsMasked = false; @@ -3206,12 +3209,13 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase, VPWidenMemoryRecipe(const char unsigned SC, Instruction &I, std::initializer_list<VPValue *> Operands, - bool Consecutive, bool Reverse, + bool Consecutive, bool Reverse, bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL) : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I), Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive), - Reverse(Reverse) { + Reverse(Reverse), Compressed(Compressed) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); + assert((Consecutive || !Compressed) && "Compressed implies consecutive"); assert(isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse && "Reversed acccess without VPVectorEndPointerRecipe address?"); @@ -3241,6 +3245,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase, /// order. bool isReverse() const { return Reverse; } + /// Return whether the consecutive loaded/stored addresses are compressed. + bool isCompressed() const { return Compressed; } + /// Return the address accessed by this recipe. 
VPValue *getAddr() const { return getOperand(0); } @@ -3274,18 +3281,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase, struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse, + bool Consecutive, bool Reverse, bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive, - Reverse, Metadata, DL), + Reverse, Compressed, Metadata, DL), VPValue(this, &Load) { setMask(Mask); } VPWidenLoadRecipe *clone() override { return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(), - getMask(), Consecutive, Reverse, *this, - getDebugLoc()); + getMask(), Consecutive, Reverse, Compressed, + *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC); @@ -3316,8 +3323,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), - {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L, - L.getDebugLoc()), + {Addr, &EVL}, L.isConsecutive(), L.isReverse(), + L.isCompressed(), L, L.getDebugLoc()), VPValue(this, &getIngredient()) { setMask(Mask); } @@ -3355,16 +3362,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, VPValue *Mask, bool Consecutive, bool Reverse, - const VPIRMetadata &Metadata, DebugLoc DL) + bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal}, - Consecutive, Reverse, Metadata, DL) { + Consecutive, Reverse, Compressed, Metadata, DL) { setMask(Mask); } VPWidenStoreRecipe *clone() override { return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(), getStoredValue(), getMask(), Consecutive, - Reverse, *this, getDebugLoc()); + Reverse, Compressed, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC); @@ -3399,7 +3406,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(), - S.isReverse(), S, S.getDebugLoc()) { + S.isReverse(), S.isCompressed(), S, + S.getDebugLoc()) { setMask(Mask); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 80cd112dbcd8a..0b0bd63ee2b28 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3565,8 +3565,12 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, InstructionCost Cost = 0; if (IsMasked) { - Cost += - Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); + Cost += Compressed + ? Ctx.TTI.getExpandCompressMemoryOpCost(Opcode, Ty, + /*VariableMask*/ true, + Alignment, Ctx.CostKind) + : Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, + Ctx.CostKind); } else { TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? 
getOperand(0) @@ -3603,9 +3607,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, "wide.masked.gather"); } else if (Mask) { - NewLI = - Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, - PoisonValue::get(DataTy), "wide.masked.load"); + NewLI = Compressed + ? Builder.CreateMaskedExpandLoad(DataTy, Addr, Alignment, Mask, + PoisonValue::get(DataTy), + "wide.masked.expand.load") + : Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, + PoisonValue::get(DataTy), + "wide.masked.load"); } else { NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); } @@ -3732,7 +3740,10 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { if (CreateScatter) NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); else if (Mask) - NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); + NewSI = Compressed + ? Builder.CreateMaskedCompressStore(StoredVal, Addr, Alignment, + Mask) + : Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); else NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); applyMetadata(*NewSI); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 48bd697397f41..cdfbc531ebfa6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -91,13 +91,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), - Ingredient.getDebugLoc()); + false /*Consecutive*/, false /*Reverse*/, false /*Compressed*/, + VPIRMetadata(*Load), Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - VPIRMetadata(*Store), Ingredient.getDebugLoc()); + false /*Compressed*/, VPIRMetadata(*Store), + Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { @@ -4207,7 +4208,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) { auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); auto *L = new VPWidenLoadRecipe( *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, - /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); + /*Reverse=*/false, /*Compressed=*/false, {}, LoadGroup->getDebugLoc()); L->insertBefore(LoadGroup); NarrowedOps.insert(L); return L; @@ -4344,7 +4345,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()); auto *S = new VPWidenStoreRecipe( *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, - /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); + /*Reverse=*/false, /*Compressed=*/false, {}, StoreGroup->getDebugLoc()); S->insertBefore(StoreGroup); StoreGroup->eraseFromParent(); } >From e827ecc68e80a8566c444f153e639c0c81168c23 Mon Sep 17 00:00:00 2001 From: Sergey Kachkov <[email protected]> Date: Wed, 15 Jan 2025 16:09:16 +0300 Subject: [PATCH 2/3] [LoopVectorize][NFC] Add pre-commit tests --- 
.../LoopVectorize/compress-idioms.ll | 480 ++++++++++++++++++ 1 file changed, 480 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/compress-idioms.ll diff --git a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll new file mode 100644 index 0000000000000..1390092e40387 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll @@ -0,0 +1,480 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S 2>&1 | FileCheck %s + +define void @test_store_with_pointer(ptr writeonly %dst, ptr readonly %src, i32 %c, i32 %n) { +; CHECK-LABEL: define void @test_store_with_pointer( +; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[DST_ADDR_09:%.*]] = phi ptr [ [[DST]], %[[FOR_BODY_PREHEADER]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr [[DST_ADDR_09]], i64 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[DST_ADDR_09]], align 4 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[DST_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[DST_ADDR_09]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %dst.addr.09 = phi ptr [ %dst, %for.body.preheader ], [ %dst.addr.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %incdec.ptr = getelementptr inbounds i8, ptr %dst.addr.09, i64 4 + store i32 %0, ptr %dst.addr.09, align 4 + br label %for.inc + +for.inc: + %dst.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %dst.addr.09, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 
%wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define void @test_store_with_index(ptr writeonly %dst, ptr readonly %src, i32 %c, i32 %n) { +; CHECK-LABEL: define void @test_store_with_index( +; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[IDX_012]], 1 +; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX_012]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDXPROM4]] +; CHECK-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %idx.012 = phi i32 [ 0, %for.body.preheader ], [ %idx.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %inc = add nsw i32 %idx.012, 1 + %idxprom4 = sext i32 %idx.012 to i64 + %arrayidx5 = getelementptr inbounds i32, ptr %dst, i64 %idxprom4 + store i32 %0, ptr %arrayidx5, align 4 + br label %for.inc + +for.inc: + %idx.1 = phi i32 [ %inc, %if.then ], [ %idx.012, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define void @test_load_with_pointer(ptr %dst, ptr readonly %src, i32 %c, i32 %n) { +; CHECK-LABEL: define void @test_load_with_pointer( +; CHECK-SAME: ptr [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label 
%[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[SRC_ADDR_09:%.*]] = phi ptr [ [[SRC]], %[[FOR_BODY_PREHEADER]] ], [ [[SRC_ADDR_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr [[SRC_ADDR_09]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_09]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[SRC_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[SRC_ADDR_09]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %src.addr.09 = phi ptr [ %src, %for.body.preheader ], [ %src.addr.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %incdec.ptr = getelementptr inbounds i8, ptr %src.addr.09, i64 4 + %1 = load i32, ptr %src.addr.09, align 4 + store i32 %1, ptr %arrayidx, align 4 + br label %for.inc + +for.inc: + %src.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %src.addr.09, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define void @test_load_with_index(ptr %dst, ptr readonly %src, i32 %c, i32 %n) { +; CHECK-LABEL: define void @test_load_with_index( +; CHECK-SAME: ptr [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: 
[[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[IDX_012]], 1 +; CHECK-NEXT: [[IDXPROM2:%.*]] = sext i32 [[IDX_012]] to i64 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IDXPROM2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %idx.012 = phi i32 [ 0, %for.body.preheader ], [ %idx.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %inc = add nsw i32 %idx.012, 1 + %idxprom2 = sext i32 %idx.012 to i64 + %arrayidx3 = getelementptr inbounds i32, ptr %src, i64 %idxprom2 + %1 = load i32, ptr %arrayidx3, align 4 + store i32 %1, ptr %arrayidx, align 4 + br label %for.inc + +for.inc: + %idx.1 = phi i32 [ %inc, %if.then ], [ %idx.012, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define void @test_store_value(ptr writeonly %dst, ptr readonly %src, i32 %c, i32 %n) { +; CHECK-LABEL: define void @test_store_value( +; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP5]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[IDX_06:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label 
%[[FOR_INC]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[IDX_06]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[IDX_06]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_06]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %idx.06 = phi i32 [ 0, %for.body.preheader ], [ %idx.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %inc = add nsw i32 %idx.06, 1 + %arrayidx2 = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv + store i32 %idx.06, ptr %arrayidx2, align 4 + br label %for.inc + +for.inc: + %idx.1 = phi i32 [ %inc, %if.then ], [ %idx.06, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define i32 @test_multiple_uses(ptr writeonly %dst, ptr readonly %src, i32 %c, i32 %n) { +; CHECK-LABEL: define i32 @test_multiple_uses( +; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP12]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IDX_1_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[IDX_013]], 1 +; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX_013]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDXPROM4]] +; CHECK-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: 
[[FOR_INC]]: +; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_013]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp12 = icmp sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: + %idx.1.lcssa = phi i32 [ %idx.1, %for.inc ] + br label %for.cond.cleanup + +for.cond.cleanup: + %idx.0.lcssa = phi i32 [ 0, %entry ], [ %idx.1.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %idx.0.lcssa + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %idx.013 = phi i32 [ 0, %for.body.preheader ], [ %idx.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %inc = add nsw i32 %idx.013, 1 + %idxprom4 = sext i32 %idx.013 to i64 + %arrayidx5 = getelementptr inbounds i32, ptr %dst, i64 %idxprom4 + store i32 %0, ptr %arrayidx5, align 4 + br label %for.inc + +for.inc: + %idx.1 = phi i32 [ %inc, %if.then ], [ %idx.013, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define i32 @test_pre_increment(ptr writeonly %dst, ptr readonly %src, i32 %c, i32 %n) { +; CHECK-LABEL: define i32 @test_pre_increment( +; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP12]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IDX_1_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1]], %[[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[IDX_013]], 1 +; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDXPROM4]] +; CHECK-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_013]], %[[FOR_BODY]] ] +; CHECK-NEXT: 
[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; +entry: + %cmp12 = icmp sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: + %idx.1.lcssa = phi i32 [ %idx.1, %for.inc ] + br label %for.cond.cleanup + +for.cond.cleanup: + %idx.0.lcssa = phi i32 [ 0, %entry ], [ %idx.1.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %idx.0.lcssa + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %idx.013 = phi i32 [ 0, %for.body.preheader ], [ %idx.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %inc = add nsw i32 %idx.013, 1 + %idxprom4 = sext i32 %inc to i64 + %arrayidx5 = getelementptr inbounds i32, ptr %dst, i64 %idxprom4 + store i32 %0, ptr %arrayidx5, align 4 + br label %for.inc + +for.inc: + %idx.1 = phi i32 [ %inc, %if.then ], [ %idx.013, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} >From ca191beb3b8dba46c9826c546adda39680f350be Mon Sep 17 00:00:00 2001 From: Sergey Kachkov <[email protected]> Date: Thu, 27 Mar 2025 12:46:33 +0300 Subject: [PATCH 3/3] [LoopVectorize] Support vectorization of compressing patterns in VPlan --- .../Vectorize/LoopVectorizationLegality.h | 18 ++ .../Vectorize/LoopVectorizationLegality.cpp | 35 +++ .../Transforms/Vectorize/LoopVectorize.cpp | 110 +++++++- llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 + llvm/lib/Transforms/Vectorize/VPlan.h | 46 ++++ .../Transforms/Vectorize/VPlanAnalysis.cpp | 21 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 69 ++++- llvm/lib/Transforms/Vectorize/VPlanValue.h | 5 +- .../LoopVectorize/compress-idioms.ll | 248 ++++++++++++++++-- .../Transforms/Vectorize/VPlanTest.cpp | 8 +- 10 files changed, 518 insertions(+), 43 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 405d4a742f37b..a84ead26f1d9d 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -272,6 +272,10 @@ class LoopVectorizationLegality { /// induction descriptor. using InductionList = MapVector<PHINode *, InductionDescriptor>; + /// MonotonicPHIList saves monotonic phi variables and maps them to the + /// monotonic phi descriptor. + using MonotonicPHIList = MapVector<PHINode *, MonotonicDescriptor>; + /// RecurrenceSet contains the phi nodes that are recurrences other than /// inductions and reductions. using RecurrenceSet = SmallPtrSet<const PHINode *, 8>; @@ -315,6 +319,11 @@ class LoopVectorizationLegality { /// Returns the induction variables found in the loop. const InductionList &getInductionVars() const { return Inductions; } + /// Returns the monotonic phi variables found in the loop. 
+ const MonotonicPHIList &getMonotonicPHIs() const { return MonotonicPHIs; } + + bool hasMonotonicPHIs() const { return !MonotonicPHIs.empty(); } + /// Return the fixed-order recurrences found in the loop. RecurrenceSet &getFixedOrderRecurrences() { return FixedOrderRecurrences; } @@ -372,6 +381,12 @@ class LoopVectorizationLegality { /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). int isConsecutivePtr(Type *AccessTy, Value *Ptr) const; + /// Returns true if Phi is monotonic variable. + bool isMonotonicPHI(PHINode *Phi) const; + + /// Check if memory access is compressed when vectorizing. + bool isCompressedPtr(Type *AccessTy, Value *Ptr, BasicBlock *BB) const; + /// Returns true if \p V is invariant across all loop iterations according to /// SCEV. bool isInvariant(Value *V) const; @@ -677,6 +692,9 @@ class LoopVectorizationLegality { /// variables can be pointers. InductionList Inductions; + /// Holds all of the monotonic phi variables that we found in the loop. + MonotonicPHIList MonotonicPHIs; + /// Holds all the casts that participate in the update chain of the induction /// variables, and that have been proven to be redundant (possibly under a /// runtime guard). These casts can be ignored when creating the vectorized diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 03112c67dda7b..464ec496909e2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -45,6 +45,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden, cl::desc("Enable recognition of non-constant strided " "pointer induction variables.")); +static cl::opt<bool> EnableMonotonicPatterns( + "lv-monotonic-patterns", cl::init(true), cl::Hidden, + cl::desc("Enable recognition of monotonic patterns.")); + static cl::opt<bool> HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, cl::desc("Allow enabling loop hints to reorder " @@ -470,6 +474,30 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, return 0; } +bool LoopVectorizationLegality::isMonotonicPHI(PHINode *Phi) const { + return MonotonicPHIs.count(Phi); +} + +bool LoopVectorizationLegality::isCompressedPtr(Type *AccessTy, Value *Ptr, + BasicBlock *BB) const { + MonotonicDescriptor Desc; + if (!MonotonicDescriptor::isMonotonicVal(Ptr, TheLoop, Desc, *PSE.getSE())) + return false; + + // Check if memory operation will use the same mask as monotonic phi. + // TODO: relax restrictions of current implementation. + if (Desc.getPredicateEdge() != + MonotonicDescriptor::Edge(BB, BB->getUniqueSuccessor())) + return false; + + // Check if pointer step equals access size. 
+ auto *Step = + dyn_cast<SCEVConstant>(Desc.getExpr()->getStepRecurrence(*PSE.getSE())); + if (!Step) + return false; + return Step->getAPInt() == BB->getDataLayout().getTypeAllocSize(AccessTy); +} + bool LoopVectorizationLegality::isInvariant(Value *V) const { return LAI->isInvariant(V); } @@ -916,6 +944,13 @@ bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) { return true; } + MonotonicDescriptor MD; + if (EnableMonotonicPatterns && + MonotonicDescriptor::isMonotonicPHI(Phi, TheLoop, MD, *PSE.getSE())) { + MonotonicPHIs[Phi] = MD; + return true; + } + if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { AllowedExit.insert(Phi); FixedOrderRecurrences.insert(Phi); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 25e8a63eae9cd..58e0b459dcd79 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1241,9 +1241,9 @@ class LoopVectorizationCostModel { getDivRemSpeculationCost(Instruction *I, ElementCount VF) const; - /// Returns widening decision (CM_Widen or CM_Widen_Reverse) if \p I is a - /// memory instruction with consecutive access that can be widened, or - /// CM_Unknown otherwise. + /// Returns widening decision (CM_Widen, CM_Widen_Reverse or CM_Compressed) if + /// \p I is a memory instruction with consecutive access that can be widened, + /// or CM_Unknown otherwise. InstWidening memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); /// Returns true if \p I is a memory instruction in an interleaved-group @@ -3000,6 +3000,9 @@ LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, auto *Ptr = getLoadStorePointerOperand(I); auto *ScalarTy = getLoadStoreType(I); + if (Legal->isCompressedPtr(ScalarTy, Ptr, I->getParent())) + return CM_Compressed; + // In order to be widened, the pointer should be consecutive, first of all. auto Stride = Legal->isConsecutivePtr(ScalarTy, Ptr); if (!Stride) @@ -3256,6 +3259,39 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { AddToWorklistIfAllowed(IndUpdate); } + // Handle monotonic phis (similarly to induction vars). 
+ for (const auto &MonotonicPHI : Legal->getMonotonicPHIs()) { + auto *Phi = MonotonicPHI.first; + auto *PhiUpdate = cast<Instruction>(Phi->getIncomingValueForBlock(Latch)); + const auto &Desc = MonotonicPHI.second; + + auto UniformPhi = llvm::all_of(Phi->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + if (I == Desc.getStepInst()) + return true; + if (auto *PN = dyn_cast<PHINode>(I); PN && Desc.getChain().contains(PN)) + return true; + return !TheLoop->contains(I) || Worklist.count(I) || + IsVectorizedMemAccessUse(I, Phi); + }); + if (!UniformPhi) + continue; + + auto UniformPhiUpdate = + llvm::all_of(PhiUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + if (I == Phi) + return true; + return !TheLoop->contains(I) || Worklist.count(I) || + IsVectorizedMemAccessUse(I, Phi); + }); + if (!UniformPhiUpdate) + continue; + + AddToWorklistIfAllowed(Phi); + AddToWorklistIfAllowed(PhiUpdate); + } + Uniforms[VF].insert_range(Worklist); } @@ -4047,6 +4083,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPEVLBasedIVPHISC: case VPDef::VPPredInstPHISC: case VPDef::VPBranchOnMaskSC: + case VPDef::VPMonotonicPHISC: continue; case VPDef::VPReductionSC: case VPDef::VPActiveLaneMaskPHISC: @@ -4560,6 +4597,10 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, if (Plan.hasEarlyExit()) return 1; + // Monotonic vars don't support interleaving. + if (Legal->hasMonotonicPHIs()) + return 1; + const bool HasReductions = any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), IsaPred<VPReductionPHIRecipe>); @@ -8074,11 +8115,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, return Recipe; VPHeaderPHIRecipe *PhiRecipe = nullptr; - assert((Legal->isReductionVariable(Phi) || + assert((Legal->isMonotonicPHI(Phi) || Legal->isReductionVariable(Phi) || Legal->isFixedOrderRecurrence(Phi)) && - "can only widen reductions and fixed-order recurrences here"); + "can only widen monotonic phis, reductions and fixed-order " + "recurrences here"); VPValue *StartV = Operands[0]; - if (Legal->isReductionVariable(Phi)) { + Value *IncomingVal = + Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()); + if (Legal->isMonotonicPHI(Phi)) { + const MonotonicDescriptor &Desc = + Legal->getMonotonicPHIs().find(Phi)->second; + assert(Desc.getExpr()->getStart() == PSE.getSCEV(IncomingVal)); + PhiRecipe = new VPMonotonicPHIRecipe(Phi, Desc, StartV); + } else if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi); assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); @@ -8429,6 +8478,46 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // bring the VPlan to its final state. // --------------------------------------------------------------------------- + // Adjust the recipes for any monotonic phis. + for (VPRecipeBase &R : HeaderVPBB->phis()) { + auto *MonotonicPhi = dyn_cast<VPMonotonicPHIRecipe>(&R); + if (!MonotonicPhi) + continue; + + // Prohibit scalarization of monotonic phis. + if (!all_of(Range, [&](ElementCount VF) { + return CM.isUniformAfterVectorization( + MonotonicPhi->getUnderlyingInstr(), VF); + })) + return nullptr; + + // Obtain mask value for the predicate edge from the last VPBlendRecipe in + // chain. 
+ VPValue *Chain = MonotonicPhi->getBackedgeValue(); + VPValue *Mask = nullptr; + while (auto *BlendR = dyn_cast<VPBlendRecipe>(Chain)) + for (unsigned I = 0, E = BlendR->getNumIncomingValues(); I != E; ++I) + if (auto *IncomingVal = BlendR->getIncomingValue(I); + IncomingVal != MonotonicPhi) { + Chain = IncomingVal; + Mask = BlendR->getMask(I); + break; + } + assert(Mask); + + auto &Desc = MonotonicPhi->getDescriptor(); + auto &SE = *PSE.getSE(); + auto *Step = vputils::getOrCreateVPValueForSCEVExpr( + *Plan, Desc.getExpr()->getStepRecurrence(SE)); + + auto *MonotonicI = + new VPInstruction(VPInstruction::ComputeMonotonicResult, + {MonotonicPhi, Mask, Step}, *Desc.getStepInst()); + auto *InsertBlock = MonotonicPhi->getBackedgeRecipe().getParent(); + InsertBlock->insert(MonotonicI, InsertBlock->getFirstNonPhi()); + MonotonicPhi->getBackedgeValue()->replaceAllUsesWith(MonotonicI); + } + // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); @@ -9891,6 +9980,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); + + if (LVL.hasMonotonicPHIs() && SelectedIC > 1) { + reportVectorizationFailure( + "Interleaving of loop with monotonic vars", + "Interleaving of loops with monotonic vars is not supported", + "CantInterleaveWithMonotonicVars", ORE, L); + return false; + } + // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. if (VF.Width.isVector() || SelectedIC > 1) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index dd26a059d56ad..87fd6b543c363 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -984,6 +984,7 @@ void VPlan::execute(VPTransformState *State) { auto *PhiR = cast<VPSingleDefRecipe>(&R); // VPInstructions currently model scalar Phis only. bool NeedsScalar = isa<VPInstruction>(PhiR) || + isa<VPMonotonicPHIRecipe>(PhiR) || (isa<VPReductionPHIRecipe>(PhiR) && cast<VPReductionPHIRecipe>(PhiR)->isInLoop()); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 26256951a9c6c..d442b906884fa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -553,6 +553,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenIntOrFpInductionSC: case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: + case VPRecipeBase::VPMonotonicPHISC: case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: @@ -1014,6 +1015,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, ComputeAnyOfResult, ComputeFindIVResult, ComputeReductionResult, + ComputeMonotonicResult, // Extracts the last lane from its operand if it is a vector, or the last // part if scalar. In the latter case, the recipe will be removed during // unrolling. @@ -2406,6 +2408,50 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, } }; +/// A recipe for handling monotonic phis. The start value is the first operand +/// of the recipe and the incoming value from the backedge is the second +/// operand. 
+class VPMonotonicPHIRecipe : public VPHeaderPHIRecipe { + MonotonicDescriptor Desc; + +public: + VPMonotonicPHIRecipe(PHINode *Phi, const MonotonicDescriptor &Desc, + VPValue *Start) + : VPHeaderPHIRecipe(VPDef::VPMonotonicPHISC, Phi, Start), Desc(Desc) {} + + ~VPMonotonicPHIRecipe() override = default; + + VPMonotonicPHIRecipe *clone() override { + auto *R = new VPMonotonicPHIRecipe(cast<PHINode>(getUnderlyingInstr()), + Desc, getStartValue()); + R->addOperand(getBackedgeValue()); + return R; + } + + VP_CLASSOF_IMPL(VPDef::VPMonotonicPHISC) + + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPDef::VPMonotonicPHISC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + const MonotonicDescriptor &getDescriptor() const { return Desc; } + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool usesFirstLaneOnly(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..da9e1f27310ee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -97,6 +97,11 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::ComputeReductionResult: { return inferScalarType(R->getOperand(0)); } + case VPInstruction::ComputeMonotonicResult: { + auto *PhiR = cast<VPMonotonicPHIRecipe>(R->getOperand(0)); + auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); + return OrigPhi->getType(); + } case VPInstruction::ExplicitVectorLength: return Type::getIntNTy(Ctx, 32); case Instruction::PHI: @@ -276,14 +281,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe()) .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe, - VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>( - [this](const auto *R) { - // Handle header phi recipes, except VPWidenIntOrFpInduction - // which needs special handling due it being possibly truncated. - // TODO: consider inferring/caching type of siblings, e.g., - // backedge value, here and in cases below. - return inferScalarType(R->getStartValue()); - }) + VPMonotonicPHIRecipe, VPWidenPointerInductionRecipe, + VPEVLBasedIVPHIRecipe>([this](const auto *R) { + // Handle header phi recipes, except VPWidenIntOrFpInduction + // which needs special handling due it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below. 
+ return inferScalarType(R->getStartValue()); + }) .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>( [](const auto *R) { return R->getScalarType(); }) .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 0b0bd63ee2b28..437e5c987a107 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -550,6 +550,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ActiveLaneMask: case VPInstruction::ComputeAnyOfResult: case VPInstruction::ReductionStartVector: + case VPInstruction::ComputeMonotonicResult: return 3; case VPInstruction::ComputeFindIVResult: return 4; @@ -900,6 +901,34 @@ Value *VPInstruction::generate(VPTransformState &State) { return ReducedPartRdx; } + case VPInstruction::ComputeMonotonicResult: { + assert(getParent()->getPlan()->getUF() == 1 && + "Expected unroll factor of 1."); + + auto *Phi = State.get(getOperand(0), /*IsScalar*/ true); + auto *PhiTy = Phi->getType(); + Value *Mask = State.get(getOperand(1), 0); + auto *MaskTy = Mask->getType(); + assert(isa<VectorType>(MaskTy) && + cast<VectorType>(MaskTy)->getElementType()->isIntegerTy(1) && + "Mask type should be <N x i1>"); + + const auto &DL = State.CFG.PrevBB->getDataLayout(); + auto *IntTy = PhiTy->isIntegerTy() ? PhiTy : DL.getIndexType(PhiTy); + + auto *Step = State.get(getOperand(2), /*IsScalar*/ true); + + auto &Builder = State.Builder; + auto *NumElems = Builder.CreateAddReduce( + Builder.CreateZExt(Mask, MaskTy->getWithNewType(IntTy))); + auto *Offset = Builder.CreateMul(NumElems, Step); + + return PhiTy->isPointerTy() + ? Builder.CreatePtrAdd(Phi, Offset, "monotonic.add", + getGEPNoWrapFlags()) + : Builder.CreateAdd(Phi, Offset, "monotonic.add", + hasNoUnsignedWrap(), hasNoSignedWrap()); + } case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: { @@ -1169,6 +1198,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, I32Ty, {Arg0Ty, I32Ty, I1Ty}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::ComputeMonotonicResult: { + Type *ElementTy = Ctx.Types.inferScalarType(getOperand(0)); + auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); + return Ctx.TTI.getArithmeticReductionCost(Instruction::Add, VectorTy, + std::nullopt, Ctx.CostKind); + } case VPInstruction::ExtractLastElement: { // Add on the cost of extracting the element. 
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); @@ -1198,6 +1233,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::ComputeMonotonicResult || getOpcode() == VPInstruction::AnyOf; } @@ -1421,6 +1457,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ComputeReductionResult: O << "compute-reduction-result"; break; + case VPInstruction::ComputeMonotonicResult: + O << "compute-monotonic-result"; + break; case VPInstruction::LogicalAnd: O << "logical-and"; break; @@ -2043,7 +2082,9 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { case OperationType::OverflowingBinOp: return Opcode == Instruction::Add || Opcode == Instruction::Sub || Opcode == Instruction::Mul || - Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; + Opcode == + VPInstruction::VPInstruction::CanonicalIVIncrementForPart || + Opcode == VPInstruction::ComputeMonotonicResult; case OperationType::Trunc: return Opcode == Instruction::Trunc; case OperationType::DisjointOp: @@ -2053,7 +2094,8 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { case OperationType::GEPOp: return Opcode == Instruction::GetElementPtr || Opcode == VPInstruction::PtrAdd || - Opcode == VPInstruction::WidePtrAdd; + Opcode == VPInstruction::WidePtrAdd || + Opcode == VPInstruction::ComputeMonotonicResult; case OperationType::FPMathOp: return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || Opcode == Instruction::FSub || Opcode == Instruction::FNeg || @@ -4453,6 +4495,29 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPMonotonicPHIRecipe::execute(VPTransformState &State) { + assert(getParent()->getPlan()->getUF() == 1 && "Expected unroll factor 1."); + Value *Start = getStartValue()->getLiveInIRValue(); + BasicBlock *VectorPH = + State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); + PHINode *MonotonicPHI = + State.Builder.CreatePHI(Start->getType(), 2, "monotonic.iv"); + MonotonicPHI->addIncoming(Start, VectorPH); + MonotonicPHI->setDebugLoc(getDebugLoc()); + State.set(this, MonotonicPHI, /*IsScalar=*/true); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPMonotonicPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "MONOTONIC-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + void VPWidenPHIRecipe::execute(VPTransformState &State) { Value *Op0 = State.get(getOperand(0)); Type *VecTy = Op0->getType(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5da74630ef626..4fd2398c64999 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -371,12 +371,13 @@ class VPDef { VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, VPReductionPHISC, + VPMonotonicPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes VPFirstPHISC = VPWidenPHISC, VPFirstHeaderPHISC = VPCanonicalIVPHISC, - VPLastHeaderPHISC = VPReductionPHISC, - VPLastPHISC = VPReductionPHISC, + VPLastHeaderPHISC = VPMonotonicPHISC, + VPLastPHISC = VPMonotonicPHISC, }; VPDef(const unsigned char SC) : SubclassID(SC) {} diff --git a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll 
b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll index 1390092e40387..7bef1db8f7cdc 100644 --- a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll +++ b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll @@ -5,18 +5,52 @@ define void @test_store_with_pointer(ptr writeonly %dst, ptr readonly %src, i32 ; CHECK-LABEL: define void @test_store_with_pointer( ; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[DST1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP12]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi ptr [ [[DST]], %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP11]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP4]], <4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[MONOTONIC_ADD]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi ptr [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[FOR_BODY_PREHEADER]] ], [ [[DST]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label 
%[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[DST_ADDR_09:%.*]] = phi ptr [ [[DST]], %[[FOR_BODY_PREHEADER]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[DST_ADDR_09:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -29,7 +63,7 @@ define void @test_store_with_pointer(ptr writeonly %dst, ptr readonly %src, i32 ; CHECK-NEXT: [[DST_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[DST_ADDR_09]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: %cmp8 = icmp sgt i32 %n, 0 @@ -69,18 +103,54 @@ define void @test_store_with_index(ptr writeonly %dst, ptr readonly %src, i32 %c ; CHECK-LABEL: define void @test_store_with_index( ; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[DST1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[MONOTONIC_IV]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i64 
[[TMP11]] +; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP12]], <4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 1 +; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP20]], 4 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -95,7 +165,7 @@ define void @test_store_with_index(ptr writeonly %dst, ptr readonly %src, i32 %c ; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; entry: %cmp11 = icmp sgt i32 %n, 0 @@ -141,14 +211,51 @@ define void @test_load_with_pointer(ptr %dst, ptr readonly %src, i32 %c, i32 %n) ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP28]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP28]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr 
[[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP27]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[WIDE_MASKED_EXPAND_LOAD:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 [[MONOTONIC_IV]], <4 x i1> [[TMP3]], <4 x i32> poison), !alias.scope [[META9]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_EXPAND_LOAD]], ptr align 4 [[TMP27]], <4 x i1> [[TMP3]]), !alias.scope [[META6]], !noalias [[META9]] +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64> +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP21]]) +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24]] = getelementptr inbounds i8, ptr [[MONOTONIC_IV]], i64 [[TMP23]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[FOR_BODY_PREHEADER]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[SRC_ADDR_09:%.*]] = phi ptr [ [[SRC]], %[[FOR_BODY_PREHEADER]] ], [ [[SRC_ADDR_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[SRC_ADDR_09:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SRC_ADDR_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -162,7 +269,7 @@ define void 
@test_load_with_pointer(ptr %dst, ptr readonly %src, i32 %c, i32 %n) ; CHECK-NEXT: [[SRC_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[SRC_ADDR_09]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; entry: %cmp8 = icmp sgt i32 %n, 0 @@ -207,14 +314,53 @@ define void @test_load_with_index(ptr %dst, ptr readonly %src, i32 %c, i32 %n) { ; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP36]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[MONOTONIC_IV]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[TMP11]] +; CHECK-NEXT: [[WIDE_MASKED_EXPAND_LOAD:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 [[TMP12]], <4 x i1> [[TMP3]], <4 x i32> poison), !alias.scope [[META16]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_EXPAND_LOAD]], ptr align 4 [[TMP36]], <4 x i1> [[TMP3]]), !alias.scope [[META13]], !noalias [[META16]] +; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP29]]) +; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 1 +; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP32]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP34]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -230,7 +376,7 @@ define void @test_load_with_index(ptr %dst, ptr readonly %src, i32 %c, i32 %n) { ; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; entry: %cmp11 = icmp sgt i32 %n, 0 @@ -339,20 +485,56 @@ define i32 @test_multiple_uses(ptr writeonly %dst, ptr readonly %src, i32 %c, i3 ; CHECK-LABEL: define i32 @test_multiple_uses( ; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP12]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[DST1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP13]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> 
poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[MONOTONIC_IV]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP5]] +; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP6]], <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 1 +; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP11]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: -; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ], [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IDX_1_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ] -; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -367,7 +549,7 @@ define i32 @test_multiple_uses(ptr writeonly %dst, ptr readonly %src, i32 %c, i3 ; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_013]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; 
CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; entry: %cmp12 = icmp sgt i32 %n, 0 @@ -478,3 +660,27 @@ for.inc: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK: [[META9]] = !{[[META10:![0-9]+]]} +; CHECK: [[META10]] = distinct !{[[META10]], [[META8]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} +; CHECK: [[META13]] = !{[[META14:![0-9]+]]} +; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], !"LVerDomain"} +; CHECK: [[META16]] = !{[[META17:![0-9]+]]} +; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]]} +;. diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index c1791dfa5b761..dba607a5061a7 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1132,7 +1132,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {}, {}); EXPECT_TRUE(isa<VPUser>(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa<VPUser>(BaseR)); @@ -1249,7 +1249,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {}, {}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1263,8 +1263,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {}, - {}); + VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, false, + {}, {}); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); 
EXPECT_TRUE(Recipe.mayWriteToMemory());
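
For readers skimming the new compress-idioms.ll checks: the scalar inputs being vectorized have roughly the following shape (a minimal C++ sketch reconstructed from the test IR above; the function and parameter names simply mirror the tests and are not part of the patch itself):

// Compress-store idiom, as in @test_store_with_pointer: the destination
// pointer advances only on iterations where the predicate holds, so the
// vector form uses llvm.masked.compressstore plus a monotonic pointer bump.
void test_store_with_pointer(int *dst, const int *src, int c, int n) {
  for (int i = 0; i < n; ++i)
    if (src[i] < c)
      *dst++ = src[i];
}

// Expand-load idiom, as in @test_load_with_pointer: the source pointer
// advances only for accepted elements, so the vector form uses
// llvm.masked.expandload plus the same kind of monotonic bump.
void test_load_with_pointer(int *dst, const int *src, int c, int n) {
  for (int i = 0; i < n; ++i)
    if (dst[i] < c)
      dst[i] = *src++;
}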
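
The advance itself is computed by the new ComputeMonotonicResult VPInstruction: zero-extend the mask, reduce.add it, multiply by the step, and add the result to the phi (emitted as a ptradd for pointer-typed phis). A scalar model of a single vector iteration, with VF fixed at 4 purely for illustration and a hypothetical helper name, would be:

// Scalar model of ComputeMonotonicResult for one vector iteration:
// count the active mask lanes and advance the monotonic value by
// count * step (the zext/reduce.add/mul/add sequence in generate()).
long long monotonic_advance(long long phi, const bool mask[4], long long step) {
  long long active = 0;
  for (int lane = 0; lane < 4; ++lane)
    active += mask[lane] ? 1 : 0; // reduce.add(zext(mask to i64))
  return phi + active * step;     // monotonic.add
}

For pointer-typed phis the same offset is applied through CreatePtrAdd with the recipe's GEP no-wrap flags, as in the generate() hunk above.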
