https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/168820

From e673cdaee95d870dd5e2fa13ab064f6dbd0ba273 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <[email protected]>
Date: Wed, 19 Nov 2025 18:00:32 -0800
Subject: [PATCH] VectorCombine: Improve the insert/extract fold in the
 narrowing case

Keeping the extracted element in a natural position in the narrowed
vector has two beneficial effects:

1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which
   allows the insert/extract fold to trigger.
2. It makes the narrowing shuffles in a chain of extract/insert
   compatible, which allows foldLengthChangingShuffles to successfully
   recognize a chain that can be folded.

There are minor X86 test changes that look reasonable to me. The IR
change for AVX2 in 
llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2`
at all.

commit-id:c151bb04
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 22 +++++--------------
 .../VectorCombine/AMDGPU/extract-insert-i8.ll | 17 ++------------
 .../X86/extract-insert-poison.ll              | 12 ++++++----
 .../VectorCombine/X86/extract-insert.ll       |  8 +++----
 .../Transforms/VectorCombine/X86/pr126085.ll  |  4 ++--
 5 files changed, 22 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fc39f4123fac4..9025b93f75458 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -4455,22 +4455,15 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
   SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
 
   bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
-  bool IsExtIdxInBounds = ExtIdx < NumDstElts;
   bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
   if (NeedDstSrcSwap) {
     SK = TargetTransformInfo::SK_PermuteSingleSrc;
-    if (!IsExtIdxInBounds && NeedExpOrNarrow)
-      Mask[InsIdx] = 0;
-    else
-      Mask[InsIdx] = ExtIdx;
+    Mask[InsIdx] = ExtIdx % NumDstElts;
     std::swap(DstVec, SrcVec);
   } else {
     SK = TargetTransformInfo::SK_PermuteTwoSrc;
     std::iota(Mask.begin(), Mask.end(), 0);
-    if (!IsExtIdxInBounds && NeedExpOrNarrow)
-      Mask[InsIdx] = NumDstElts;
-    else
-      Mask[InsIdx] = ExtIdx + NumDstElts;
+    Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
   }
 
   // Cost
@@ -4491,14 +4484,11 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
       NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
                                     nullptr, {DstVec, SrcVec});
   } else {
-    // When creating length-changing-vector, always create with a Mask whose
-    // first element has an ExtIdx, so that the first element of the vector
-    // being created is always the target to be extracted.
+    // When creating a length-changing-vector, always try to keep the relevant
+    // element in an equivalent position, so that bulk shuffles are more likely
+    // to be useful.
     ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
-    if (IsExtIdxInBounds)
-      ExtToVecMask[ExtIdx] = ExtIdx;
-    else
-      ExtToVecMask[0] = ExtIdx;
+    ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
     // Add cost for expanding or narrowing
     NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                  DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
index eaab7199a3cf3..442a93689a791 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
@@ -91,21 +91,8 @@ entry:
 define <8 x i8> @extract_insert_chain_shortening(<32 x i8> %in) {
 ; OPT-LABEL: define <8 x i8> @extract_insert_chain_shortening(
 ; OPT-SAME: <32 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[I_1:%.*]] = extractelement <32 x i8> [[IN]], i64 17
-; OPT-NEXT:    [[I_2:%.*]] = extractelement <32 x i8> [[IN]], i64 18
-; OPT-NEXT:    [[I_3:%.*]] = extractelement <32 x i8> [[IN]], i64 19
-; OPT-NEXT:    [[I_5:%.*]] = extractelement <32 x i8> [[IN]], i64 21
-; OPT-NEXT:    [[I_6:%.*]] = extractelement <32 x i8> [[IN]], i64 22
-; OPT-NEXT:    [[I_7:%.*]] = extractelement <32 x i8> [[IN]], i64 23
-; OPT-NEXT:    [[O_0:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT:    [[O_1:%.*]] = insertelement <8 x i8> [[O_0]], i8 [[I_1]], i32 1
-; OPT-NEXT:    [[O_2:%.*]] = insertelement <8 x i8> [[O_1]], i8 [[I_2]], i32 2
-; OPT-NEXT:    [[O_3:%.*]] = insertelement <8 x i8> [[O_2]], i8 [[I_3]], i32 3
-; OPT-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 20, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT:    [[O_4:%.*]] = shufflevector <8 x i8> [[O_3]], <8 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[O_5:%.*]] = insertelement <8 x i8> [[O_4]], i8 [[I_5]], i32 5
-; OPT-NEXT:    [[O_6:%.*]] = insertelement <8 x i8> [[O_5]], i8 [[I_6]], i32 6
-; OPT-NEXT:    [[O_7:%.*]] = insertelement <8 x i8> [[O_6]], i8 [[I_7]], i32 7
+; OPT-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; OPT-NEXT:    [[O_7:%.*]] = shufflevector <8 x i8> poison, <8 x i8> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; OPT-NEXT:    ret <8 x i8> [[O_7]]
 ;
   %i.0 = extractelement <32 x i8> %in, i64 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
index e85c092b1b213..228f161698bb2 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
@@ -140,10 +140,14 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
 }
 
 define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
-; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
-; CHECK-NEXT:    ret <2 x double> [[INS]]
+; SSE-LABEL: @src_ins0_v2f64_ext3_v4f64(
+; SSE-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
+; SSE-NEXT:    ret <2 x double> [[INS]]
+;
+; AVX-LABEL: @src_ins0_v2f64_ext3_v4f64(
+; AVX-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
+; AVX-NEXT:    [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
+; AVX-NEXT:    ret <2 x double> [[INS]]
 ;
   %ext = extractelement <4 x double> %b, i32 3
   %ins = insertelement <2 x double> poison, double %ext, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
index 193ad36616a4a..e591ea55a453d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
@@ -136,8 +136,8 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
 
 define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
-; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
 ; CHECK-NEXT:    ret <2 x double> [[INS]]
 ;
   %ext = extractelement <4 x double> %b, i32 3
@@ -185,8 +185,8 @@ define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
 
 define <2 x double> @src_ins1_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @src_ins1_v2f64_ext3_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
-; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    ret <2 x double> [[INS]]
 ;
   %ext = extractelement <4 x double> %b, i32 3
diff --git a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
index f596807027db6..d29cdb3d95c81 100644
--- a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
@@ -6,8 +6,8 @@ define i32 @test(ptr %a0) {
 ; CHECK-SAME: ptr [[A0:%.*]]) {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, ptr [[A0]], align 1
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[ELT:%.*]] = extractelement <16 x i8> [[LOAD]], i64 11
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x i8> [[SHUF]], i8 [[ELT]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 11>
+; CHECK-NEXT:    [[INS:%.*]] = shufflevector <4 x i8> [[SHUF]], <4 x i8> [[TMP1]], <4 x i32> <i32 0, i32 7, i32 2, i32 3>
 ; CHECK-NEXT:    [[RES:%.*]] = bitcast <4 x i8> [[INS]] to i32
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to