https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/158013

release/21.x: [VPlan] Don't narrow op multiple times in
narrowInterleaveGroups.

Track which ops have already been narrowed, to avoid narrowing the same
operation multiple times. Repeated narrowing will lead to incorrect
results, because we could first narrow from an interleave group -> wide
load, and then narrow the wide load -> single-scalar load.

Fixes https://github.com/llvm/llvm-project/issues/156190.

>From 93505953fea754e6bbb1edb5fca75097132377b5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <f...@fhahn.com>
Date: Wed, 10 Sep 2025 17:09:49 +0100
Subject: [PATCH] release/21.x: [VPlan] Don't narrow op multiple times in
 narrowInterleaveGroups.

Track which ops have already been narrowed, to avoid narrowing the same
operation multiple times. Repeated narrowing will lead to incorrect
results, because we could first narrow from an interleave group -> wide
load, and then narrow the wide load -> single-scalar load.

Fixes https://github.com/llvm/llvm-project/issues/156190.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  8 +-
 ...nterleave-to-widen-memory-with-wide-ops.ll | 79 +++++++++++++++++++
 ...sform-narrow-interleave-to-widen-memory.ll | 73 +++++++++++++++++
 3 files changed, 158 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6a3b3e6e41955..f7c1c10185c68 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3252,9 +3252,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     return;
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
-  auto NarrowOp = [](VPValue *V) -> VPValue * {
+  SmallPtrSet<VPValue *, 4> NarrowedOps;
+  auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
     auto *R = V->getDefiningRecipe();
-    if (!R)
+    if (!R || NarrowedOps.contains(V))
       return V;
     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
       // Narrow interleave group to wide load, as transformed VPlan will only
@@ -3264,6 +3265,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
           LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
           /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
       L->insertBefore(LoadGroup);
+      NarrowedOps.insert(L);
       return L;
     }
 
@@ -3271,6 +3273,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       assert(RepR->isSingleScalar() &&
              isa<LoadInst>(RepR->getUnderlyingInstr()) &&
              "must be a single scalar load");
+      NarrowedOps.insert(RepR);
       return RepR;
     }
     auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3281,6 +3284,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                     WideLoad->operands(), /*IsUniform*/ true,
                                     /*Mask*/ nullptr, *WideLoad);
     N->insertBefore(WideLoad);
+    NarrowedOps.insert(N);
     return N;
   };
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 813d61b52100f..aec6c0be6dde2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -1203,3 +1203,82 @@ loop:
 exit:
   ret void
 }
+
+; Make sure multiple uses of a narrowed op are handled correctly,
+; https://github.com/llvm/llvm-project/issues/156190.
+define void @multiple_store_groups_storing_same_wide_bin_op(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; VF2-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
+; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*:]]
+; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = fadd contract <2 x double> [[BROADCAST_SPLAT]], splat (double 2.000000e+01)
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+;
+; VF4-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
+; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
+; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT: [[TMP1:%.*]] = fadd contract <4 x double> [[STRIDED_VEC]], splat (double 2.000000e+01)
+; VF4-NEXT: [[TMP2:%.*]] = fadd contract <4 x double> [[STRIDED_VEC1]], splat (double 2.000000e+01)
+; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr { double, double }, ptr %A, i64 %iv
+  %l.A.0 = load double, ptr %gep.A, align 8
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 8
+  %add.0 = fadd contract double %l.A.0, 20.0
+  %add.1 = fadd contract double %l.A.1, 20.0
+  %gep.B = getelementptr { double, double }, ptr %B, i64 %iv
+  store double %add.0, ptr %gep.B, align 8
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
+  store double %add.1, ptr %gep.B.1, align 8
+  %gep.C = getelementptr { double, double }, ptr %C, i64 %iv
+  %gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
+  store double %add.0, ptr %gep.C, align 8
+  store double %add.1, ptr %gep.C.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %.not = icmp eq i64 %iv.next, 1000
+  br i1 %.not, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 6acd7989dbfd2..451cbd7601a85 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -587,3 +587,76 @@ loop:
 exit:
   ret void
 }
+
+define void @multiple_store_groups_storing_same_load_group(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; VF2-LABEL: define void @multiple_store_groups_storing_same_load_group(
+; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*:]]
+; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[WIDE_LOAD1]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+;
+; VF4-LABEL: define void @multiple_store_groups_storing_same_load_group(
+; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
+; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[STRIDED_VEC]], <4 x double> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr { double, double }, ptr %A, i64 %iv
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
+  %l.A.0 = load double, ptr %gep.A, align 8
+  %l.A.1 = load double, ptr %gep.A.1, align 8
+  %gep.B = getelementptr { double, double }, ptr %B, i64 %iv
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
+  store double %l.A.0, ptr %gep.B, align 8
+  store double %l.A.1, ptr %gep.B.1, align 8
+  %gep.C = getelementptr { double, double }, ptr %C, i64 %iv
+  %gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
+  store double %l.A.0, ptr %gep.C, align 8
+  store double %l.A.1, ptr %gep.C.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %.not = icmp eq i64 %iv.next, 1000
+  br i1 %.not, label %exit, label %loop
+
+exit:
+  ret void
+}
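
For readers skimming the patch, the essence of the change is the NarrowedOps set that makes NarrowOp idempotent when the same operand is reached from several store groups. Below is a minimal, standalone C++ sketch of that guard pattern; the Value struct, Make helper and string "kinds" are illustrative stand-ins, not the actual VPlan classes.

// Standalone sketch, not LLVM code: remember the results of a one-shot
// narrowing in a set, so a value reached via several users is narrowed
// only once (the bug was narrowing interleave group -> wide load and then
// the wide load again -> single-scalar load).
#include <cassert>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

struct Value {
  std::string Kind; // "interleave-load", "wide-load" or "scalar-load"
};

int main() {
  std::vector<std::unique_ptr<Value>> Pool; // owns all created values
  auto Make = [&Pool](std::string Kind) {
    Pool.push_back(std::make_unique<Value>(Value{std::move(Kind)}));
    return Pool.back().get();
  };

  std::unordered_set<Value *> NarrowedOps; // values already narrowed
  auto NarrowOp = [&](Value *V) -> Value * {
    if (NarrowedOps.count(V)) // guard: never narrow the same op twice
      return V;
    Value *N = V;
    if (V->Kind == "interleave-load")
      N = Make("wide-load");   // interleave group -> single wide load
    else if (V->Kind == "wide-load")
      N = Make("scalar-load"); // wide load -> single-scalar load
    NarrowedOps.insert(N);
    return N;
  };

  // Two store groups feeding from the same load: the second visit must
  // reuse the once-narrowed value instead of narrowing it again.
  Value *LoadGroup = Make("interleave-load");
  Value *First = NarrowOp(LoadGroup); // interleave-load -> wide-load
  Value *Second = NarrowOp(First);    // guarded: returns First unchanged
  assert(First->Kind == "wide-load" && Second == First);
  (void)Second;
  return 0;
}

Without the NarrowedOps guard, the second call would narrow the already-narrowed value again, producing a single-scalar load; that is the miscompile the two new tests above exercise.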