https://github.com/alexey-bataev updated 
https://github.com/llvm/llvm-project/pull/153393

>From 95b5a297b8210b9527a47bd833b97d09cca8c312 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bat...@outlook.com>
Date: Wed, 13 Aug 2025 11:20:23 +0000
Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 72 ++++++++++++++-----
 .../alternate-vectorization-split-node.ll     |  7 +-
 .../X86/load-merge-inseltpoison.ll            |  6 +-
 .../SLPVectorizer/X86/load-merge.ll           |  6 +-
 4 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp 
b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3045eeb3eb48e..f71faa2e2a7d5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10571,27 +10571,29 @@ class InstructionsCompatibilityAnalysis {
     BasicBlock *Parent = nullptr;
     // Checks if the instruction has supported opcode.
     auto IsSupportedOpcode = [&](Instruction *I) {
-      return I && I->getOpcode() == Instruction::Add &&
+      return I &&
+             (I->getOpcode() == Instruction::Add ||
+              I->getOpcode() == Instruction::LShr) &&
              (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
     };
     // Exclude operands instructions immediately to improve compile time, it
     // will be unable to schedule anyway.
     SmallDenseSet<Value *, 8> Operands;
+    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
     for (Value *V : VL) {
       auto *I = dyn_cast<Instruction>(V);
       if (!I)
         continue;
       if (!DT.isReachableFromEntry(I->getParent()))
         continue;
-      if (!MainOp) {
-        MainOp = I;
+      if (Candidates.empty()) {
+        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
         Parent = I->getParent();
         Operands.insert(I->op_begin(), I->op_end());
         continue;
       }
       if (Parent == I->getParent()) {
-        if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
-          MainOp = I;
+        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
         Operands.insert(I->op_begin(), I->op_end());
         continue;
       }
@@ -10603,24 +10605,37 @@ class InstructionsCompatibilityAnalysis {
                  (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
              "Different nodes should have different DFS numbers");
       if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
-        MainOp = I;
+        Candidates.clear();
+        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
         Parent = I->getParent();
         Operands.clear();
         Operands.insert(I->op_begin(), I->op_end());
       }
     }
-    if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
-      MainOp = nullptr;
-      return;
+    unsigned BestOpcodeNum = 0;
+    MainOp = nullptr;
+    for (const auto &P : Candidates) {
+      if (P.second.size() < BestOpcodeNum)
+        continue;
+      for (Instruction *I : P.second) {
+        if (IsSupportedOpcode(I) && !Operands.contains(I)) {
+          MainOp = I;
+          BestOpcodeNum = P.second.size();
+          break;
+        }
+      }
     }
-    MainOpcode = MainOp->getOpcode();
+    if (MainOp)
+      MainOpcode = MainOp->getOpcode();
   }
 
   /// Returns the idempotent value for the \p MainOp with the detected \p
   /// MainOpcode. For Add, returns 0. For Or, it should choose between false 
and
   /// the operand itself, since V or V == V.
   Value *selectBestIdempotentValue() const {
-    assert(MainOpcode == Instruction::Add && "Unsupported opcode");
+    assert(
+        (MainOpcode == Instruction::Add || MainOpcode == Instruction::LShr) &&
+        "Unsupported opcode");
     return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
                                           !MainOp->isCommutative());
   }
@@ -10635,6 +10650,7 @@ class InstructionsCompatibilityAnalysis {
       return convertTo(cast<Instruction>(V), S).second;
     switch (MainOpcode) {
     case Instruction::Add:
+    case Instruction::LShr:
       return {V, selectBestIdempotentValue()};
     default:
       break;
@@ -10852,6 +10868,21 @@ class InstructionsCompatibilityAnalysis {
       }
       if (!Res)
         return InstructionsState::invalid();
+      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
+      InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
+      InstructionCost VectorCost;
+      FixedVectorType *VecTy =
+          getWidenedType(S.getMainOp()->getType(), VL.size());
+      switch (MainOpcode) {
+      case Instruction::Add:
+      case Instruction::LShr:
+      VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
+        break;
+      default:
+        llvm_unreachable("Unexpected instruction.");
+      }
+      if (VectorCost > ScalarCost)
+        return InstructionsState::invalid();
       return S;
     }
     assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21064,6 +21095,7 @@ void 
BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
         ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
         const auto *It = find(Op, CD->getInst());
         assert(It != Op.end() && "Lane not set");
+        SmallPtrSet<Instruction *, 4> Visited;
         do {
           int Lane = std::distance(Op.begin(), It);
           assert(Lane >= 0 && "Lane not set");
@@ -21085,13 +21117,15 @@ void 
BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
                   (InsertInReadyList && UseSD->isReady()))
                 WorkList.push_back(UseSD);
             }
-          } else if (ScheduleData *UseSD = getScheduleData(In)) {
-            CD->incDependencies();
-            if (!UseSD->isScheduled())
-              CD->incrementUnscheduledDeps(1);
-            if (!UseSD->hasValidDependencies() ||
-                (InsertInReadyList && UseSD->isReady()))
-              WorkList.push_back(UseSD);
+          } else if (Visited.insert(In).second) {
+            if (ScheduleData *UseSD = getScheduleData(In)) {
+              CD->incDependencies();
+              if (!UseSD->isScheduled())
+                CD->incrementUnscheduledDeps(1);
+              if (!UseSD->hasValidDependencies() ||
+                  (InsertInReadyList && UseSD->isReady()))
+                WorkList.push_back(UseSD);
+            }
           }
           It = find(make_range(std::next(It), Op.end()), CD->getInst());
         } while (It != Op.end());
@@ -21845,6 +21879,8 @@ bool BoUpSLP::collectValuesToDemote(
       return all_of(E.Scalars, [&](Value *V) {
         if (isa<PoisonValue>(V))
           return true;
+        if (E.isCopyableElement(V))
+          return true;
         auto *I = cast<Instruction>(V);
         KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
         APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
diff --git 
a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
 
b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
index 8d44d03e0e5cc..6d961fc3378b4 100644
--- 
a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
+++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
@@ -8,11 +8,8 @@ define i32 @test(ptr %c) {
 ; CHECK-NEXT:    [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
 ; CHECK-NEXT:    [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> 
poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> 
poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 
0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> 
poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 
poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> 
[[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> 
poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 4f94784a24dd4..c02ef8388b066 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture 
readonly dereferenceable(16)
 define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) 
%x) {
 ; CHECK-LABEL: @PR16739_byval(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X]], align 16
-; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> 
poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], 
i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> 
poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 
0, i64 0>
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
 ; CHECK-NEXT:    ret <4 x float> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 700e3ed9effc4..0545e5403f594 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture 
readonly dereferenceable(16)
 define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) 
%x) {
 ; CHECK-LABEL: @PR16739_byval(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X]], align 16
-; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> 
poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], 
i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> 
poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 
0, i64 0>
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
 ; CHECK-NEXT:    ret <4 x float> [[TMP5]]

>From f1917c44189d9dd588d06cdffd455b0e89aea1e3 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bat...@outlook.com>
Date: Wed, 13 Aug 2025 11:28:18 +0000
Subject: [PATCH 2/2] Fix formatting

Created using spr 1.3.5
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp 
b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f71faa2e2a7d5..257733ff6ca0d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10876,7 +10876,7 @@ class InstructionsCompatibilityAnalysis {
       switch (MainOpcode) {
       case Instruction::Add:
       case Instruction::LShr:
-      VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
+        VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
         break;
       default:
         llvm_unreachable("Unexpected instruction.");

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to