https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/153393
>From 95b5a297b8210b9527a47bd833b97d09cca8c312 Mon Sep 17 00:00:00 2001 From: Alexey Bataev <a.bat...@outlook.com> Date: Wed, 13 Aug 2025 11:20:23 +0000 Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 72 ++++++++++++++----- .../alternate-vectorization-split-node.ll | 7 +- .../X86/load-merge-inseltpoison.ll | 6 +- .../SLPVectorizer/X86/load-merge.ll | 6 +- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3045eeb3eb48e..f71faa2e2a7d5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10571,27 +10571,29 @@ class InstructionsCompatibilityAnalysis { BasicBlock *Parent = nullptr; // Checks if the instruction has supported opcode. auto IsSupportedOpcode = [&](Instruction *I) { - return I && I->getOpcode() == Instruction::Add && + return I && + (I->getOpcode() == Instruction::Add || + I->getOpcode() == Instruction::LShr) && (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I)); }; // Exclude operands instructions immediately to improve compile time, it // will be unable to schedule anyway. SmallDenseSet<Value *, 8> Operands; + SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates; for (Value *V : VL) { auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (!DT.isReachableFromEntry(I->getParent())) continue; - if (!MainOp) { - MainOp = I; + if (Candidates.empty()) { + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Parent = I->getParent(); Operands.insert(I->op_begin(), I->op_end()); continue; } if (Parent == I->getParent()) { - if (!IsSupportedOpcode(MainOp) && !Operands.contains(I)) - MainOp = I; + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Operands.insert(I->op_begin(), I->op_end()); continue; } @@ -10603,24 +10605,37 @@ class InstructionsCompatibilityAnalysis { (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && "Different nodes should have different DFS numbers"); if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) { - MainOp = I; + Candidates.clear(); + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Parent = I->getParent(); Operands.clear(); Operands.insert(I->op_begin(), I->op_end()); } } - if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) { - MainOp = nullptr; - return; + unsigned BestOpcodeNum = 0; + MainOp = nullptr; + for (const auto &P : Candidates) { + if (P.second.size() < BestOpcodeNum) + continue; + for (Instruction *I : P.second) { + if (IsSupportedOpcode(I) && !Operands.contains(I)) { + MainOp = I; + BestOpcodeNum = P.second.size(); + break; + } + } } - MainOpcode = MainOp->getOpcode(); + if (MainOp) + MainOpcode = MainOp->getOpcode(); } /// Returns the idempotent value for the \p MainOp with the detected \p /// MainOpcode. For Add, returns 0. For Or, it should choose between false and /// the operand itself, since V or V == V. Value *selectBestIdempotentValue() const { - assert(MainOpcode == Instruction::Add && "Unsupported opcode"); + assert( + (MainOpcode == Instruction::Add || MainOpcode == Instruction::LShr) && + "Unsupported opcode"); return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(), !MainOp->isCommutative()); } @@ -10635,6 +10650,7 @@ class InstructionsCompatibilityAnalysis { return convertTo(cast<Instruction>(V), S).second; switch (MainOpcode) { case Instruction::Add: + case Instruction::LShr: return {V, selectBestIdempotentValue()}; default: break; @@ -10852,6 +10868,21 @@ class InstructionsCompatibilityAnalysis { } if (!Res) return InstructionsState::invalid(); + constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; + InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind); + InstructionCost VectorCost; + FixedVectorType *VecTy = + getWidenedType(S.getMainOp()->getType(), VL.size()); + switch (MainOpcode) { + case Instruction::Add: + case Instruction::LShr: + VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); + break; + default: + llvm_unreachable("Unexpected instruction."); + } + if (VectorCost > ScalarCost) + return InstructionsState::invalid(); return S; } assert(Operands.size() == 2 && "Unexpected number of operands!"); @@ -21064,6 +21095,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); const auto *It = find(Op, CD->getInst()); assert(It != Op.end() && "Lane not set"); + SmallPtrSet<Instruction *, 4> Visited; do { int Lane = std::distance(Op.begin(), It); assert(Lane >= 0 && "Lane not set"); @@ -21085,13 +21117,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, (InsertInReadyList && UseSD->isReady())) WorkList.push_back(UseSD); } - } else if (ScheduleData *UseSD = getScheduleData(In)) { - CD->incDependencies(); - if (!UseSD->isScheduled()) - CD->incrementUnscheduledDeps(1); - if (!UseSD->hasValidDependencies() || - (InsertInReadyList && UseSD->isReady())) - WorkList.push_back(UseSD); + } else if (Visited.insert(In).second) { + if (ScheduleData *UseSD = getScheduleData(In)) { + CD->incDependencies(); + if (!UseSD->isScheduled()) + CD->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } } It = find(make_range(std::next(It), Op.end()), CD->getInst()); } while (It != Op.end()); @@ -21845,6 +21879,8 @@ bool BoUpSLP::collectValuesToDemote( return all_of(E.Scalars, [&](Value *V) { if (isa<PoisonValue>(V)) return true; + if (E.isCopyableElement(V)) + return true; auto *I = cast<Instruction>(V); KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll index 8d44d03e0e5cc..6d961fc3378b4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll @@ -8,11 +8,8 @@ define i32 @test(ptr %c) { ; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136 ; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0> +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8> ; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll index 4f94784a24dd4..c02ef8388b066 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 -; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1> +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0> ; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> ; CHECK-NEXT: ret <4 x float> [[TMP5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll index 700e3ed9effc4..0545e5403f594 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 -; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1> +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0> ; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> ; CHECK-NEXT: ret <4 x float> [[TMP5]] >From f1917c44189d9dd588d06cdffd455b0e89aea1e3 Mon Sep 17 00:00:00 2001 From: Alexey Bataev <a.bat...@outlook.com> Date: Wed, 13 Aug 2025 11:28:18 +0000 Subject: [PATCH 2/2] Fix formatting Created using spr 1.3.5 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f71faa2e2a7d5..257733ff6ca0d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10876,7 +10876,7 @@ class InstructionsCompatibilityAnalysis { switch (MainOpcode) { case Instruction::Add: case Instruction::LShr: - VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); + VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); break; default: llvm_unreachable("Unexpected instruction."); _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits