Author: Stanislav Mekhanoshin Date: 2025-08-12T15:07:07-07:00 New Revision: d0ee82040cb22ae38c92eb83b0c9ae71ca51a517
URL: https://github.com/llvm/llvm-project/commit/d0ee82040cb22ae38c92eb83b0c9ae71ca51a517 DIFF: https://github.com/llvm/llvm-project/commit/d0ee82040cb22ae38c92eb83b0c9ae71ca51a517.diff LOG: [AMDGPU] Add s_barrier_init|join|leave instructions (#153296) Added: llvm/test/CodeGen/AMDGPU/s-barrier.ll Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.h llvm/lib/Target/AMDGPU/SOPInstructions.td llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll llvm/test/MC/AMDGPU/gfx12_asm_sop1.s llvm/test/MC/AMDGPU/gfx12_asm_sopp.s llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt Removed: ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index b16d4a22207a7..f8f55772db8fe 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -503,6 +503,9 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vv*i", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vv*i", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vv*", "n", "gfx12-insts") 
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "vIs", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl index 5d86a9b369429..1a5043328895a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl @@ -23,6 +23,13 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global *out = *in; } +kernel void builtins_amdgcn_s_barrier_leave_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal(-1); + __builtin_amdgcn_s_barrier_leave(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_leave' must be a constant integer}} + *out = *in; +} + void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off) { __builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index f7641280715c8..8c02616780182 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -139,6 +139,50 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c) __builtin_amdgcn_s_barrier_wait(1); } +// CHECK-LABEL: @test_s_barrier_init( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_s_barrier_init(void *bar, int a) +{ + __builtin_amdgcn_s_barrier_init(bar, a); +} + +// CHECK-LABEL: @test_s_barrier_join( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_s_barrier_join(void *bar) +{ + __builtin_amdgcn_s_barrier_join(bar); +} + +// CHECK-LABEL: @test_s_barrier_leave( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.leave(i16 1) +// CHECK-NEXT: ret void +// +void test_s_barrier_leave() +{ + __builtin_amdgcn_s_barrier_leave(1); +} + // CHECK-LABEL: @test_s_get_barrier_state( // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index be6df257f668b..cf82f7f06a693 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -290,11 +290,29 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri Intrinsic<[llvm_i1_ty], [llvm_i32_ty], 
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt) +// The %barrier and %memberCnt arguments must be uniform, otherwise behavior is undefined. +def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">, + Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier) +// The %barrier argument must be uniform, otherwise behavior is undefined. +def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, + Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + // void @llvm.amdgcn.s.barrier.wait(i16 %barrierType) def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">, Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +// void @llvm.amdgcn.s.barrier.leave(i16 %barrierType) +def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">, + Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>; + // uint32_t @llvm.amdgcn.s.get.barrier.state(i32 %barrierId) // The %barrierType argument must be uniform, otherwise behavior is undefined. 
def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 402147abd8891..5d31eed8fe7d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, IntrinsicID); case Intrinsic::amdgcn_s_get_barrier_state: @@ -6772,6 +6774,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -6779,6 +6783,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; @@ -6829,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4); constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); + unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; MachineInstrBuilder MIB; - MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0)); + MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); I.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 31f5ba1dd5040..092439693f399 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -156,6 +156,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const; + bool selectSBarrierLeave(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index aa72c3e61f680..e65dd1b04cc48 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_signal: case Intrinsic::amdgcn_s_barrier_signal_var: case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_barrier_wait: + case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 868b1a21e3cd5..237929699dd9d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3342,6 
+3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(1).empty()); constrainOpWithReadfirstlane(B, MI, 1); return; + case Intrinsic::amdgcn_s_barrier_join: + constrainOpWithReadfirstlane(B, MI, 1); + return; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: constrainOpWithReadfirstlane(B, MI, 1); constrainOpWithReadfirstlane(B, MI, 2); @@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_s_sleep_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; + case Intrinsic::amdgcn_s_barrier_join: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ebd79d9e8f5cf..2e76225bbc542 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10825,6 +10825,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: { // these two intrinsics have two operands: barrier pointer and member count SDValue Chain = Op->getOperand(0); @@ -10832,6 +10833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue BarOp = Op->getOperand(2); SDValue CntOp = Op->getOperand(3); SDValue M0Val; + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; // extract the BarrierID from bits 4-9 of BarOp SDValue BarID; BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, @@ -10855,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); - auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL, - Op->getVTList(), Ops); + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + return SDValue(NewMI, 0); + } + case Intrinsic::amdgcn_s_barrier_join: { + // this intrinsic has one operand: barrier pointer + SDValue Chain = Op->getOperand(0); + SmallVector<SDValue, 2> Ops; + SDValue BarOp = Op->getOperand(2); + unsigned Opc; + + if (isa<ConstantSDNode>(BarOp)) { + uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue(); + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + + // extract the BarrierID from bits 4-9 of the immediate + unsigned BarID = (BarVal >> 4) & 0x3F; + SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); + Ops.push_back(K); + Ops.push_back(Chain); + } else { + Opc = AMDGPU::S_BARRIER_JOIN_M0; + + // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] + SDValue M0Val; + M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, + DAG.getShiftAmountConstant(4, MVT::i32, DL)); + M0Val = + SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val, + DAG.getTargetConstant(0x3F, DL, MVT::i32)), + 0); + Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); + } + + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } case Intrinsic::amdgcn_s_prefetch_data: { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4b48fc4ab8824..343e4550e1d93 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, case 
AMDGPU::S_MEMREALTIME: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: + case AMDGPU::S_BARRIER_LEAVE: case AMDGPU::S_GET_BARRIER_STATE_M0: case AMDGPU::S_GET_BARRIER_STATE_IMM: ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 6b9403f9c7a21..18f0e5b9b56bc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -996,6 +996,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isBarrier(unsigned Opcode) const { return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_INIT_M0 || + Opcode == AMDGPU::S_BARRIER_INIT_IMM || + Opcode == AMDGPU::S_BARRIER_JOIN_IMM || + Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_LEAVE_IMM || Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 431d73b9a95b5..c2f4dbfa247d1 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -484,6 +484,24 @@ def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (o let isConvergent = 1; } +def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -501,6 +519,12 @@ def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (out let isConvergent = 1; } +def 
S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -1588,6 +1612,17 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm let isConvergent = 1; } +def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> { + let SchedRW = [WriteBarrier]; + let simm16 = 0; + let fixed_imm = 1; + let isConvergent = 1; + let Defs = [SCC]; +} + +def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave", + (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>; + def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let SubtargetPredicate = isGFX8Plus; let simm16 = 0; @@ -2144,9 +2179,13 @@ defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>; defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; +defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; +defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>; defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; @@ -2639,6 +2678,7 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> { } defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; +defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>; defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>; defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>; diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir 
b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir index f437dee253d00..d33a809a2ee88 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir @@ -489,3 +489,126 @@ body: | S_ENDPGM 0 ... +--- +name: skip_barrier_init_imm +body: | + ; CHECK-LABEL: name: skip_barrier_init_imm + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 -1 + ; CHECK-NEXT: S_BARRIER_INIT_IMM -1, implicit $m0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + $m0 = S_MOV_B32 -1 + S_BARRIER_INIT_IMM -1, implicit $m0 + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_init_m0 +body: | + ; CHECK-LABEL: name: skip_barrier_init_m0 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 -1 + ; CHECK-NEXT: S_BARRIER_INIT_M0 implicit $m0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + $m0 = S_MOV_B32 -1 + S_BARRIER_INIT_M0 implicit $m0 + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: skip_barrier_join_imm +body: | + ; CHECK-LABEL: name: skip_barrier_join_imm + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER_JOIN_IMM -1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER_JOIN_IMM -1 + + bb.2: + S_ENDPGM 0 +... + +--- +name: skip_barrier_leave +body: | + ; CHECK-LABEL: name: skip_barrier_leave + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_NOP_e32 implicit $exec + ; CHECK-NEXT: S_BARRIER_LEAVE implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + V_NOP_e32 implicit $exec + S_BARRIER_LEAVE implicit-def $scc + + bb.2: + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 1f2b3e2c31892..7cf8883082458 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -11,12 +11,14 @@ define void @func1() { call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) call void @llvm.amdgcn.s.barrier.wait(i16 1) ret void } define void @func2() { call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) call void @llvm.amdgcn.s.barrier.wait(i16 1) ret void } @@ -24,6 +26,7 @@ define void @func2() { define amdgpu_kernel void @kernel1() #0 { ; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) call void @llvm.amdgcn.s.barrier.wait(i16 1) call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1) %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) @@ -36,6 +39,7 @@ define amdgpu_kernel void @kernel1() #0 { define amdgpu_kernel void @kernel2() #0 { ; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) call void @llvm.amdgcn.s.barrier.wait(i16 1) call void @func2() @@ -47,6 +51,9 @@ declare void @llvm.amdgcn.s.barrier.wait(i16) #1 declare void @llvm.amdgcn.s.barrier.signal(i32) #1 declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1 declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll new file mode 100644 index 0000000000000..1821bd45dc1cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -0,0 +1,275 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison + +define void @func1() { +; GFX12-SDAG-LABEL: func1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 3 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: func1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_join 3 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: 
s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() { +; GFX12-SDAG-LABEL: func2: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: func2: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_join 1 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +; GFX12-SDAG-LABEL: kernel1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-SDAG-NEXT: s_barrier_init m0 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48 +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; 
GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63 +; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s3 +; GFX12-SDAG-NEXT: s_barrier_init m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s3 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_signal -1 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_barrier_leave +; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+8 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+16 +; GFX12-SDAG-NEXT: s_barrier_signal -1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-SDAG-NEXT: s_barrier_wait -1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: kernel1: +; GFX12-GISEL: ; %bb.0: +; 
GFX12-GISEL-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_load_b32 s0, s[12:13], 0x2c +; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-GISEL-NEXT: s_barrier_init m0 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63 +; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000 +; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s1 +; GFX12-GISEL-NEXT: s_barrier_init m0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s1 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_signal -1 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 +; GFX12-GISEL-NEXT: s_barrier_join m0 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: s_barrier_leave +; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+8 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+16 +; GFX12-GISEL-NEXT: s_barrier_signal -1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_barrier_wait -1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 +; GFX12-GISEL-NEXT: 
s_getpc_b64 s[0:1] +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1 +; GFX12-GISEL-NEXT: s_endpgm + call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12) + call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9) + call void @llvm.amdgcn.s.barrier.signal(i32 -1) + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.leave(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar) + %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + %state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1) + ret void +} + +define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +; GFX12-SDAG-LABEL: kernel2: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-SDAG-NEXT: s_getpc_b64 s[6:7] +; GFX12-SDAG-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+8 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+16 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48 +; GFX12-SDAG-NEXT: s_wait_kmcnt 
0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: kernel2: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s8, s4, 48 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0 +; GFX12-GISEL-NEXT: s_getpc_b64 s[4:5] +; GFX12-GISEL-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+8 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+16 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[12:13], s[4:5], 0x0 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70002 +; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_join 2 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX12-GISEL-NEXT: s_endpgm + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + + call void @func2() + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare i32 
@llvm.amdgcn.s.get.barrier.state(i32) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s index b7d4764335a53..949847e2dec3b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s @@ -725,6 +725,24 @@ s_barrier_signal_isfirst -1 s_barrier_signal_isfirst m0 // GFX12: s_barrier_signal_isfirst m0 ; encoding: [0x7d,0x4f,0x80,0xbe] +s_barrier_init -1 +// GFX12: s_barrier_init -1 ; encoding: [0xc1,0x51,0x80,0xbe] + +s_barrier_init -2 +// GFX12: s_barrier_init -2 ; encoding: [0xc2,0x51,0x80,0xbe] + +s_barrier_init m0 +// GFX12: s_barrier_init m0 ; encoding: [0x7d,0x51,0x80,0xbe] + +s_barrier_join -1 +// GFX12: s_barrier_join -1 ; encoding: [0xc1,0x52,0x80,0xbe] + +s_barrier_join -2 +// GFX12: s_barrier_join -2 ; encoding: [0xc2,0x52,0x80,0xbe] + +s_barrier_join m0 +// GFX12: s_barrier_join m0 ; encoding: [0x7d,0x52,0x80,0xbe] + s_get_barrier_state s3, -1 // GFX12: s_get_barrier_state s3, -1 ; encoding: [0xc1,0x50,0x83,0xbe] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s index a58d68cb30cb1..75c3d6e6368df 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s @@ -76,6 +76,9 @@ s_barrier_wait 0xffff s_barrier_wait 1 // GFX12: encoding: [0x01,0x00,0x94,0xbf] +s_barrier_leave +// GFX12: encoding: [0x00,0x00,0x95,0xbf] + //===----------------------------------------------------------------------===// // s_waitcnt //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt index 1016d07df4995..c88fbc2fc6c4f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt @@ -719,6 +719,24 @@ 0x7d,0x4f,0x80,0xbe # GFX12: s_barrier_signal_isfirst m0 ; encoding: [0x7d,0x4f,0x80,0xbe] +0xc1,0x51,0x80,0xbe +# GFX12: s_barrier_init -1 ; encoding: [0xc1,0x51,0x80,0xbe] + +0xc2,0x51,0x80,0xbe +# GFX12: s_barrier_init -2 ; encoding: [0xc2,0x51,0x80,0xbe] + +0x7d,0x51,0x80,0xbe +# GFX12: s_barrier_init m0 ; encoding: [0x7d,0x51,0x80,0xbe] + +0xc1,0x52,0x80,0xbe +# GFX12: s_barrier_join -1 ; encoding: [0xc1,0x52,0x80,0xbe] + +0xc2,0x52,0x80,0xbe +# GFX12: s_barrier_join -2 ; encoding: [0xc2,0x52,0x80,0xbe] + +0x7d,0x52,0x80,0xbe +# GFX12: s_barrier_join m0 ; encoding: [0x7d,0x52,0x80,0xbe] + 0xc1,0x50,0x83,0xbe # GFX12: s_get_barrier_state s3, -1 ; encoding: [0xc1,0x50,0x83,0xbe] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt index f8aef72678623..1e40ea27c47d3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt @@ -68,6 +68,9 @@ # GFX12: s_barrier_wait 1 ; encoding: [0x01,0x00,0x94,0xbf] 0x01,0x00,0x94,0xbf +# GFX12: s_barrier_leave ; encoding: [0x00,0x00,0x95,0xbf] +0x00,0x00,0x95,0xbf + # GFX12: s_branch 0 ; encoding: [0x00,0x00,0xa0,0xbf] 0x00,0x00,0xa0,0xbf _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
