https://github.com/skc7 created https://github.com/llvm/llvm-project/pull/165746
None >From 760be5b3e38c6c47c9fc0c517e91c3e0e727a59c Mon Sep 17 00:00:00 2001 From: skc7 <[email protected]> Date: Thu, 30 Oct 2025 22:42:33 +0530 Subject: [PATCH] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 ------------------ llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 6 + llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 3 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 14 ++ ...amdgpu-lower-special-lds-and-module-lds.ll | 119 +++++++++++++++++ .../amdgpu-lower-special-lds-and-sw-lds.ll | 86 ++++++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + .../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 2 +- 9 files changed, 236 insertions(+), 131 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524c43466..3c0328e93ffbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS { return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use &U : GV->uses()) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector<GlobalVariable *> OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. - if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. - std::vector<Function *> OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap<Function *, uint32_t> Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. - auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS { } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet<GlobalVariable *> ModuleScopeVariables; DenseSet<GlobalVariable *> TableLookupVariables; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e17c2113ca398..f7dff4ba4c5e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -273,6 +273,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // this is a re-run of the pass // so we don't have anything to do. // - No variables are absolute. + // Named-barriers which are absolute symbols are removed + // from the maps. std::optional<bool> HasAbsoluteGVs; bool HasSpecialGVs = false; for (auto &Map : {DirectMapKernel, IndirectMapKernel}) { @@ -284,6 +286,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { if (IsDirectMapDynLDSGV) continue; if (isNamedBarrier(*GV)) { + if (IsAbsolute) { + DirectMapKernel[Fn].erase(GV); + IndirectMapKernel[Fn].erase(GV); + } HasSpecialGVs = true; continue; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4a9437b37aa39..827326ae90a75 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -293,7 +293,8 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { if (!AMDGPU::isLDSVariableToLower(*GV)) continue; - + if (isNamedBarrier(*GV)) + continue; for (User *V : GV->users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 916826ea169aa..33ef2e0dd5d71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -465,6 +465,11 @@ static cl::opt<bool> EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableLowerSpecialLDS("amdgpu-enable-lower-special-lds", + cl::desc("Enable lowering of special lds pass."), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " @@ -963,6 +968,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. + if (EnableLowerSpecialLDS) + PM.addPass(AMDGPULowerSpecialLDSPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1331,6 +1338,10 @@ void AMDGPUPassConfig::addIRPasses() { // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); + // Lower special LDS accesses. + if (EnableLowerSpecialLDS) + addPass(createAMDGPULowerSpecialLDSLegacyPass()); + // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); @@ -2076,6 +2087,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUExportKernelRuntimeHandlesPass()); + if (EnableLowerSpecialLDS) + addPass(AMDGPULowerSpecialLDSPass()); + if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll new file mode 100644 index 0000000000000..73cde6405ae1f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]] +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +;. +define void @func1() #0 { +; CHECK-LABEL: define void @func1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() #0 { +; CHECK-LABEL: define void @func2( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: call void @func1() +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + store i8 9, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel2( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @func2() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +;. +; CHECK: attributes #[[ATTR0]] = { nounwind } +; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-lds-size"="1" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: [[META0]] = !{i32 8396816, i32 8396817} +; CHECK: [[META1]] = !{i32 8396912, i32 8396913} +; CHECK: [[META2]] = !{i32 8396848, i32 8396849} +; CHECK: [[META3]] = !{i32 0, i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll new file mode 100644 index 0000000000000..3127f1feac230 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll @@ -0,0 +1,86 @@ +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.barkernel = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; +define void @foo() #0 { +; CHECK-LABEL: define void @foo( +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @bar() #0 { +; CHECK-LABEL: define void @bar( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @fookernel() #0 { +; CHECK-LABEL: define amdgpu_kernel void @fookernel( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) +; CHECK: call void @llvm.amdgcn.s.barrier() + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @foo() + call void @bar() + store i8 9, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @barkernel() #0 { +; CHECK-LABEL: define amdgpu_kernel void @barkernel( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.barkernel, i32 9) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.barkernel) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @bar() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind sanitize_address } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 3aa36635a0ab6..eea353d3132df 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-special-lds,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-special-lds,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-special-lds,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 6e5212580ba2e..c47a5827d4f4d 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -39,6 +39,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles +; GCN-O0-NEXT: AMDGPU lowering of special LDS variables ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -187,6 +188,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles +; GCN-O1-NEXT: AMDGPU lowering of special LDS variables ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -474,6 +476,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles +; GCN-O1-OPTS-NEXT: AMDGPU lowering of special LDS variables ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -791,6 +794,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles +; GCN-O2-NEXT: AMDGPU lowering of special LDS variables ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1112,6 +1116,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles +; GCN-O3-NEXT: AMDGPU lowering of special LDS variables ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 03a666fbe3aea..4fd728dfc9191 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
