https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/152161
Finishes adding basic inline-asm callbr support for AMDGPU, started by https://github.com/llvm/llvm-project/pull/149308. >From 47408c30af9b0dd6b4c791130aad4a91efee8949 Mon Sep 17 00:00:00 2001 From: Robert Imschweiler <robert.imschwei...@amd.com> Date: Tue, 5 Aug 2025 10:24:07 -0500 Subject: [PATCH] [AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with basic inline-asm Finishes adding basic inline-asm callbr support for AMDGPU, started by https://github.com/llvm/llvm-project/pull/149308. --- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 89 +++--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 15 +- llvm/test/CodeGen/AMDGPU/callbr.ll | 54 ++++ ...nify-divergent-exit-nodes-with-musttail.ll | 51 ++++ llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 258 ++++++++++++++++-- .../si-annotate-nested-control-flows.ll | 100 ++++++- .../si-unify-exit-multiple-unreachables.ll | 161 ++++++++++- llvm/test/CodeGen/AMDGPU/update-phi.ll | 39 +++ llvm/test/Transforms/StructurizeCFG/callbr.ll | 56 ++++ 10 files changed, 751 insertions(+), 82 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/callbr.ll create mode 100644 llvm/test/Transforms/StructurizeCFG/callbr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..2df6bbc74f6da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,27 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (DummyReturnBB == nullptr) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. - BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (DummyReturnBB == nullptr) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dfe6f65d240e6..5f09cfbc6d817 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16476,12 +16476,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { - const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 's': - case 'r': + case 'r': { + const unsigned BitWidth = VT.getSizeInBits(); switch (BitWidth) { case 16: RC = &AMDGPU::SReg_32RegClass; @@ -16496,7 +16496,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, break; } break; - case 'v': + } + case 'v': { + const unsigned BitWidth = VT.getSizeInBits(); switch (BitWidth) { case 16: RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass @@ -16509,9 +16511,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, break; } break; + } case 'a': if (!Subtarget->hasMAIInsts()) break; + const unsigned BitWidth = VT.getSizeInBits(); switch (BitWidth) { case 16: RC = &AMDGPU::AGPR_32RegClass; diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index a69d64956d6d9..d43ed8a9364b8 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -480,11 +480,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -516,7 +515,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1284,13 +1283,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding blocks must not be modified by this pass. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index bea532bd52955..2181c77f0e6ef 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,83 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination + %cond1_32 = zext i1 %cond1 to i32 + callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] + +outer_loop: + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + callbr void asm "", ""() to label %inner_loop [] + +inner_loop: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 999, ptr addrspace(1) %out, align 4 + %cond3 = icmp eq i32 %tmp, 3 + %cond3_32 = zext i1 %cond3 to i32 + callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 2dfb72a08cffc..ee7df73c849f7 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + %n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..7bc973397d5a1 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -structurizecfg %s -o - | FileCheck %s +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. + +define void @callbr_inline_asm(i32 %c, i1 %d, i1 %e) { +; CHECK-LABEL: define void @callbr_inline_asm( +; CHECK-SAME: i32 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]]) { +; CHECK-NEXT: [[D_INV:%.*]] = xor i1 [[D]], true +; CHECK-NEXT: br i1 [[D_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[E]], label %[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[E]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %d, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %e, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %e, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits