https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/158013

release/21.x: [VPlan] Don't narrow op multiple times in
narrowInterleaveGroups.

Track which ops have already been narrowed, to avoid narrowing the same
operation multiple times. Repeated narrowing will lead to incorrect
results, because we could first narrow from an interleave group -> wide
load, and then narrow the wide load -> single-scalar load.

Fixes https://github.com/llvm/llvm-project/issues/156190.

>From 93505953fea754e6bbb1edb5fca75097132377b5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <f...@fhahn.com>
Date: Wed, 10 Sep 2025 17:09:49 +0100
Subject: [PATCH] release/21.x: [VPlan] Don't narrow op multiple times in
 narrowInterleaveGroups.

Track which ops have already been narrowed, to avoid narrowing the same
operation multiple times. Repeated narrowing will lead to incorrect
results, because we could first narrow from an interleave group -> wide
load, and then narrow the wide load -> single-scalar load.

Fixes https://github.com/llvm/llvm-project/issues/156190.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  8 +-
 ...nterleave-to-widen-memory-with-wide-ops.ll | 79 +++++++++++++++++++
 ...sform-narrow-interleave-to-widen-memory.ll | 73 +++++++++++++++++
 3 files changed, 158 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6a3b3e6e41955..f7c1c10185c68 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3252,9 +3252,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     return;
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
-  auto NarrowOp = [](VPValue *V) -> VPValue * {
+  SmallPtrSet<VPValue *, 4> NarrowedOps;
+  auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
     auto *R = V->getDefiningRecipe();
-    if (!R)
+    if (!R || NarrowedOps.contains(V))
       return V;
     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
       // Narrow interleave group to wide load, as transformed VPlan will only
@@ -3264,6 +3265,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
           LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
           /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
       L->insertBefore(LoadGroup);
+      NarrowedOps.insert(L);
       return L;
     }
 
@@ -3271,6 +3273,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       assert(RepR->isSingleScalar() &&
              isa<LoadInst>(RepR->getUnderlyingInstr()) &&
              "must be a single scalar load");
+      NarrowedOps.insert(RepR);
       return RepR;
     }
     auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3281,6 +3284,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                     WideLoad->operands(), /*IsUniform*/ true,
                                     /*Mask*/ nullptr, *WideLoad);
     N->insertBefore(WideLoad);
+    NarrowedOps.insert(N);
     return N;
   };
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 813d61b52100f..aec6c0be6dde2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -1203,3 +1203,82 @@ loop:
 exit:
   ret void
 }
+
+; Make sure multiple uses of a narrowed op are handled correctly,
+; https://github.com/llvm/llvm-project/issues/156190.
+define void @multiple_store_groups_storing_same_wide_bin_op(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; VF2-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
+; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*:]]
+; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = fadd contract <2 x double> [[BROADCAST_SPLAT]], splat (double 2.000000e+01)
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+;
+; VF4-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
+; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
+; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT: [[TMP1:%.*]] = fadd contract <4 x double> [[STRIDED_VEC]], splat (double 2.000000e+01)
+; VF4-NEXT: [[TMP2:%.*]] = fadd contract <4 x double> [[STRIDED_VEC1]], splat (double 2.000000e+01)
+; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr { double, double }, ptr %A, i64 %iv
+  %l.A.0 = load double, ptr %gep.A, align 8
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 8
+  %add.0 = fadd contract double %l.A.0, 20.0
+  %add.1 = fadd contract double %l.A.1, 20.0
+  %gep.B = getelementptr { double, double }, ptr %B, i64 %iv
+  store double %add.0, ptr %gep.B, align 8
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
+  store double %add.1, ptr %gep.B.1, align 8
+  %gep.C = getelementptr { double, double }, ptr %C, i64 %iv
+  %gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
+  store double %add.0, ptr %gep.C, align 8
+  store double %add.1, ptr %gep.C.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %.not = icmp eq i64 %iv.next, 1000
+  br i1 %.not, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 6acd7989dbfd2..451cbd7601a85 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -587,3 +587,76 @@ loop:
 exit:
   ret void
 }
+
+define void @multiple_store_groups_storing_same_load_group(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; VF2-LABEL: define void @multiple_store_groups_storing_same_load_group(
+; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*:]]
+; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[WIDE_LOAD1]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+;
+; VF4-LABEL: define void @multiple_store_groups_storing_same_load_group(
+; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
+; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[STRIDED_VEC]], <4 x double> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr { double, double }, ptr %A, i64 %iv
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
+  %l.A.0 = load double, ptr %gep.A, align 8
+  %l.A.1 = load double, ptr %gep.A.1, align 8
+  %gep.B = getelementptr { double, double }, ptr %B, i64 %iv
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
+  store double %l.A.0, ptr %gep.B, align 8
+  store double %l.A.1, ptr %gep.B.1, align 8
+  %gep.C = getelementptr { double, double }, ptr %C, i64 %iv
+  %gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
+  store double %l.A.0, ptr %gep.C, align 8
+  store double %l.A.1, ptr %gep.C.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %.not = icmp eq i64 %iv.next, 1000
+  br i1 %.not, label %exit, label %loop
+
+exit:
+  ret void
+}
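
For readers skimming the patch, the essence of the change is the NarrowedOps set that makes NarrowOp idempotent when the same operand is reached from several store groups. Below is a minimal, standalone C++ sketch of that guard pattern; the Value struct, Make helper and string "kinds" are illustrative stand-ins, not the actual VPlan classes.

// Standalone sketch, not LLVM code: remember the results of a one-shot
// narrowing in a set, so a value reached via several users is narrowed
// only once (the bug was narrowing interleave group -> wide load and then
// the wide load again -> single-scalar load).
#include <cassert>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

struct Value {
  std::string Kind; // "interleave-load", "wide-load" or "scalar-load"
};

int main() {
  std::vector<std::unique_ptr<Value>> Pool; // owns all created values
  auto Make = [&Pool](std::string Kind) {
    Pool.push_back(std::make_unique<Value>(Value{std::move(Kind)}));
    return Pool.back().get();
  };

  std::unordered_set<Value *> NarrowedOps; // values already narrowed
  auto NarrowOp = [&](Value *V) -> Value * {
    if (NarrowedOps.count(V)) // guard: never narrow the same op twice
      return V;
    Value *N = V;
    if (V->Kind == "interleave-load")
      N = Make("wide-load");   // interleave group -> single wide load
    else if (V->Kind == "wide-load")
      N = Make("scalar-load"); // wide load -> single-scalar load
    NarrowedOps.insert(N);
    return N;
  };

  // Two store groups feeding from the same load: the second visit must
  // reuse the once-narrowed value instead of narrowing it again.
  Value *LoadGroup = Make("interleave-load");
  Value *First = NarrowOp(LoadGroup); // interleave-load -> wide-load
  Value *Second = NarrowOp(First);    // guarded: returns First unchanged
  assert(First->Kind == "wide-load" && Second == First);
  (void)Second;
  return 0;
}

Without the NarrowedOps guard, the second call would narrow the already-narrowed value again, producing a single-scalar load; that is the miscompile the two new tests above exercise.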