llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Robert Imschweiler (ro-i) <details> <summary>Changes</summary> Finishes adding basic inline-asm callbr support for AMDGPU, started by https://github.com/llvm/llvm-project/pull/149308. --- Patch is 42.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152161.diff 10 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp (+54-35) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+7-3) - (modified) llvm/lib/Transforms/Scalar/StructurizeCFG.cpp (+7-8) - (added) llvm/test/CodeGen/AMDGPU/callbr.ll (+54) - (modified) llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll (+51) - (modified) llvm/test/CodeGen/AMDGPU/infinite-loop.ll (+237-21) - (modified) llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll (+91-9) - (modified) llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll (+155-6) - (modified) llvm/test/CodeGen/AMDGPU/update-phi.ll (+39) - (added) llvm/test/Transforms/StructurizeCFG/callbr.ll (+56) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..2df6bbc74f6da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,27 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (DummyReturnBB == nullptr) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. - BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (DummyReturnBB == nullptr) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dfe6f65d240e6..5f09cfbc6d817 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16476,12 +16476,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { - const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 's': - case 'r': + case 'r': { + const unsigned BitWidth = VT.getSizeInBits(); switch (BitWidth) { case 16: RC = &AMDGPU::SReg_32RegClass; @@ -16496,7 +16496,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, break; } break; - case 'v': + } + case 'v': { + const unsigned BitWidth = VT.getSizeInBits(); switch (BitWidth) { case 16: RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass @@ -16509,9 +16511,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, break; } break; + } case 'a': if (!Subtarget->hasMAIInsts()) break; + const unsigned BitWidth = VT.getSizeInBits(); switch (BitWidth) { case 16: RC = &AMDGPU::AGPR_32RegClass; diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index a69d64956d6d9..d43ed8a9364b8 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -480,11 +480,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -516,7 +515,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1284,13 +1283,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding blocks must not be modified by this pass. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index bea532bd52955..2181c77f0e6ef 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/152161 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits