Author: Leon Clark Date: 2025-08-12T14:08:37+01:00 New Revision: 9115bef8ee1d9a47fef3221d9f3e0fd5c4ed947c
URL: https://github.com/llvm/llvm-project/commit/9115bef8ee1d9a47fef3221d9f3e0fd5c4ed947c DIFF: https://github.com/llvm/llvm-project/commit/9115bef8ee1d9a47fef3221d9f3e0fd5c4ed947c.diff LOG: [VectorCombine] Shrink loads used in shufflevector rebroadcasts. (#153138) Reopen #128938. Attempt to shrink the size of vector loads where only some of the incoming lanes are used for rebroadcasts in shufflevector instructions. --------- Co-authored-by: Leon Clark <leocl...@amd.com> Co-authored-by: Simon Pilgrim <llvm-...@redking.me.uk> Added: llvm/test/Transforms/VectorCombine/load-shufflevector.ll Modified: clang/test/CodeGenOpenCL/preserve_vec3.cl llvm/lib/Transforms/Vectorize/VectorCombine.cpp llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll llvm/test/Transforms/VectorCombine/X86/load-widening.ll llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll Removed: ################################################################################ diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index 49ebae6fc7013..e73657e30d884 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-LABEL: define dso_local spir_kernel void @foo( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: ret void // @@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2( // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel float3_to_double2(global float3 *a, global double2 *b) { @@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d9a495782967c..d67a2304368eb 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -29,13 +30,16 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <numeric> +#include <optional> #include <queue> #include <set> +#include <tuple> #define DEBUG_TYPE "vector-combine" #include "llvm/Transforms/Utils/InstructionWorklist.h" @@ -137,6 +141,7 @@ class VectorCombine { bool foldSelectShuffle(Instruction &I, bool FromReduction = false); bool foldInterleaveIntrinsics(Instruction &I); bool shrinkType(Instruction &I); + bool shrinkLoadForShuffles(Instruction &I); void replaceValue(Value &Old, Value &New) { LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); @@ -3862,6 +3867,133 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { return true; } +// Attempt to shrink loads that are only used by shufflevector instructions. +bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { + auto *OldLoad = dyn_cast<LoadInst>(&I); + if (!OldLoad || !OldLoad->isSimple()) + return false; + + auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType()); + if (!OldLoadTy) + return false; + + unsigned const OldNumElements = OldLoadTy->getNumElements(); + + // Search all uses of load. If all uses are shufflevector instructions, and + // the second operands are all poison values, find the minimum and maximum + // indices of the vector elements referenced by all shuffle masks. + // Otherwise return `std::nullopt`. + using IndexRange = std::pair<int, int>; + auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> { + IndexRange OutputRange = IndexRange(OldNumElements, -1); + for (llvm::Use &Use : I.uses()) { + // Ensure all uses match the required pattern. + User *Shuffle = Use.getUser(); + ArrayRef<int> Mask; + + if (!match(Shuffle, + m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask)))) + return std::nullopt; + + // Ignore shufflevector instructions that have no uses. + if (Shuffle->use_empty()) + continue; + + // Find the min and max indices used by the shufflevector instruction. + for (int Index : Mask) { + if (Index >= 0 && Index < static_cast<int>(OldNumElements)) { + OutputRange.first = std::min(Index, OutputRange.first); + OutputRange.second = std::max(Index, OutputRange.second); + } + } + } + + if (OutputRange.second < OutputRange.first) + return std::nullopt; + + return OutputRange; + }; + + // Get the range of vector elements used by shufflevector instructions. + if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) { + unsigned const NewNumElements = Indices->second + 1u; + + // If the range of vector elements is smaller than the full load, attempt + // to create a smaller load. + if (NewNumElements < OldNumElements) { + IRBuilder Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + // Calculate costs of old and new ops. + Type *ElemTy = OldLoadTy->getElementType(); + FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements); + Value *PtrOp = OldLoad->getPointerOperand(); + + InstructionCost OldCost = TTI.getMemoryOpCost( + Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); + InstructionCost NewCost = + TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); + + using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>; + SmallVector<UseEntry, 4u> NewUses; + unsigned const MaxIndex = NewNumElements * 2u; + + for (llvm::Use &Use : I.uses()) { + auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser()); + ArrayRef<int> OldMask = Shuffle->getShuffleMask(); + + // Create entry for new use. + NewUses.push_back({Shuffle, OldMask}); + + // Validate mask indices. + for (int Index : OldMask) { + if (Index >= static_cast<int>(MaxIndex)) + return false; + } + + // Update costs. + OldCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + OldLoadTy, OldMask, CostKind); + NewCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + NewLoadTy, OldMask, CostKind); + } + + LLVM_DEBUG( + dbgs() << "Found a load used only by shufflevector instructions: " + << I << "\n OldCost: " << OldCost + << " vs NewCost: " << NewCost << "\n"); + + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // Create new load of smaller vector. + auto *NewLoad = cast<LoadInst>( + Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign())); + NewLoad->copyMetadata(I); + + // Replace all uses. + for (UseEntry &Use : NewUses) { + ShuffleVectorInst *Shuffle = Use.first; + std::vector<int> &NewMask = Use.second; + + Builder.SetInsertPoint(Shuffle); + Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); + Value *NewShuffle = Builder.CreateShuffleVector( + NewLoad, PoisonValue::get(NewLoadTy), NewMask); + + replaceValue(*Shuffle, *NewShuffle); + } + + return true; + } + } + return false; +} + /// This is the entry point for all transforms. Pass manager diff erences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -3938,6 +4070,9 @@ bool VectorCombine::run() { MadeChange |= foldSelectShuffle(I); MadeChange |= foldShuffleToIdentity(I); break; + case Instruction::Load: + MadeChange |= shrinkLoadForShuffles(I); + break; case Instruction::BitCast: MadeChange |= foldBitcastShuffle(I); break; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll index f6e8fcd5d1d8c..86bdd125e5e57 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll @@ -11,13 +11,13 @@ $getAt = comdat any define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 { ; SSE-LABEL: @ConvertVectors_ByRef( -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2> +; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2> ; SSE-NEXT: ret <4 x float> [[TMP3]] ; ; AVX-LABEL: @ConvertVectors_ByRef( -; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2> +; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16 +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2> ; AVX-NEXT: ret <4 x float> [[TMP3]] ; %2 = alloca ptr, align 8 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll index 977da754ec5a7..0c2346e616e36 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -252,8 +252,7 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync { ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 @@ -341,8 +340,7 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll index 30a089818074e..eacc40bfa9b53 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll @@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address { ; CHECK-LABEL: @load_v2i32_v4i32_asan( -; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1 +; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: ret <4 x i32> [[S]] ; %l = load <2 x i32>, ptr %p, align 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll index b30dc9ffdc596..eddfc57a7d256 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles) define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { -; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 -; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0> -; SSE-NEXT: ret <4 x double> [[BLEND]] -; -; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 -; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3> -; AVX-NEXT: ret <4 x double> [[BLEND]] +; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64( +; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32 +; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0> +; CHECK-NEXT: ret <4 x double> [[BLEND]] ; %ld0 = load <4 x double>, ptr %p0, align 32 %ld1 = load <4 x double>, ptr %p1, align 32 @@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) { %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3> ret <2 x float> %s2 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll new file mode 100644 index 0000000000000..7d9393ab77f20 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll @@ -0,0 +1,404 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=vector-combine -S < %s | FileCheck %s + +define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load volatile <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x half> %val1 +} + +define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x half> %val1 +} + +define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2> + ret <8 x half> %val1 +} + +define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2> +; CHECK-NEXT: ret <4 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2> + ret <4 x half> %val1 +} + +define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: ret <8 x half> [[TMP1]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2> + ret <8 x half> %val1 +} + +define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x half> [[VAL3]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + br label %finally + +else: + %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + br label %finally + +finally: + %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ] + ret <8 x half> %val3 +} + +define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <4 x half> [[VAL3]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + br label %finally + +else: + %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + br label %finally + +finally: + %val3 = phi <4 x half> [ %val1, %then ], [ %val2, %else ] + ret <4 x half> %val3 +} + +define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x half> [[VAL3]] +; +entry: + %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + br label %finally + +else: + %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + br label %finally + +finally: + %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ] + ret <8 x half> %val3 +} + +define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x i32> %val1 +} + +define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2> + ret <8 x i32> %val1 +} + +define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2> +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2> + ret <4 x i32> %val1 +} + +define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2> + ret <8 x i32> %val1 +} + +define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + br label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +} + +define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + br label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +} + +define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <4 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + br label %finally + +finally: + %val3 = phi <4 x i32> [ %val1, %then ], [ %val2, %else ] + ret <4 x i32> %val3 +} + +define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + br label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +} + +define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr { +; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_4( +; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br label %[[FINALLY:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: br label %[[FINALLY]] +; CHECK: [[FINALLY]]: +; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ] +; CHECK-NEXT: ret <8 x i32> [[VAL3]] +; +entry: + %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32 + br i1 %cond, label %then, label %else + +then: + %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2> + br label %finally + +else: + %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4> + br label %finally + +finally: + %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ] + ret <8 x i32> %val3 +} + +define <16 x i8> @shuffle_v16_v16i8_r0_31(ptr %arg) { +; CHECK-LABEL: define <16 x i8> @shuffle_v16_v16i8_r0_31( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ARG]], align 1 +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> +; CHECK-NEXT: ret <16 x i8> [[SHUF]] +; + %load= load <16 x i8>, ptr %arg, align 1 + %shuf = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %shuf +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits