Author: adeshcom14 Date: 2026-06-05T09:45:00+05:30 New Revision: 6f233ceb01347d37bd943dde84e81710ac24bfdf
URL: https://github.com/llvm/llvm-project/commit/6f233ceb01347d37bd943dde84e81710ac24bfdf DIFF: https://github.com/llvm/llvm-project/commit/6f233ceb01347d37bd943dde84e81710ac24bfdf.diff LOG: [AMDGPU] Track tensor load/store DMAs with asyncmark (#200775) Wire existing variants of the tensor load-to/store-from LDS intrinsics into the existing asyncmark/wait.asyncmark mechanism via TENSOR_CNT waitcnt counter. Fixes: LCOMPILER-1619 Added: Modified: clang/include/clang/Basic/BuiltinsAMDGPU.td clang/include/clang/Basic/BuiltinsAMDGPUDocs.td llvm/docs/AMDGPUAsyncOperations.rst llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.h llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll Removed: ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index b15a36df6c08f..d8020bdcc8458 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -957,8 +957,14 @@ def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_Ext def __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64 : AMDGPUBuiltin<"void(long int address_space<3> *)", [Const], "gfx1250-insts">; def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(long int address_space<3> *, long int)", [Const], "gfx1250-insts">; -def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">; -def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">; +def 
__builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts"> { + let Documentation = [DocTensorLoadToLDS_GFX1250]; + let ArgNames = ["D0", "D1", "D2", "D3", "D4", "cpol"]; +} +def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts"> { + let Documentation = [DocTensorStoreFromLDS_GFX1250]; + let ArgNames = ["D0", "D1", "D2", "D3", "D4", "cpol"]; +} def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td index cb2f000fcf548..97ae239e96ad5 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td @@ -615,3 +615,35 @@ scale variant. matrix A or B data can be reused from a previous WMMA instruction. }]; } + +//===----------------------------------------------------------------------===// +// Tensor DMA Builtins +//===----------------------------------------------------------------------===// + +def DocCatTensorDMA : DocumentationCategory<"Tensor DMA Builtins"> { + let Content = [{ +Asynchronous tensor DMA transfers between global memory and LDS. Track +completion using ``TENSOR_CNT`` or *asyncmarks* (see +:ref:`amdgpu-async-operations`). + +``D0``..``D4`` are the five chunks of the hardware tensor descriptor. The +``_d2`` machine instruction is selected when ``D2`` and ``D3`` are zero- +initialized; otherwise ``_d4`` is used. ``D4`` is reserved and silently +ignored on gfx1250. ``cpol`` is a compile-time cache-policy bitfield +(``th`` in bits [0:2], scope in bits [3:4]). 
+}]; +} + +def DocTensorLoadToLDS_GFX1250 : Documentation { + let Category = DocCatTensorDMA; + let Content = [{ +Asynchronously copies a tensor from global memory into LDS. +}]; +} + +def DocTensorStoreFromLDS_GFX1250 : Documentation { + let Category = DocCatTensorDMA; + let Content = [{ +Asynchronously copies a tensor from LDS into global memory. +}]; +} diff --git a/llvm/docs/AMDGPUAsyncOperations.rst b/llvm/docs/AMDGPUAsyncOperations.rst index a51fe4ebb7e97..0b8ea0ae77174 100644 --- a/llvm/docs/AMDGPUAsyncOperations.rst +++ b/llvm/docs/AMDGPUAsyncOperations.rst @@ -50,6 +50,13 @@ memory and LDS memory. void @llvm.amdgcn.global.store.async.from.lds.type(ptr %dst, ptr %src) void @llvm.amdgcn.cluster.load.async.to.lds.type(ptr %dst, ptr %src) +**GFX1250 Tensor DMA Instructions** + +.. code-block:: llvm + + void @llvm.amdgcn.tensor.load.to.lds(...) + void @llvm.amdgcn.tensor.store.from.lds(...) + Asyncmark Operations --------------------- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp index 282eaba6586a7..df8d22fb5e3dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp @@ -35,6 +35,8 @@ StringLiteral getInstCounterName(InstCounterType T) { return "X_CNT"; case ASYNC_CNT: return "ASYNC_CNT"; + case TENSOR_CNT: + return "TENSOR_CNT"; case VA_VDST: return "VA_VDST"; case VM_VSRC: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h index 24dbcdf8cc475..093d8a45d207b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h @@ -30,6 +30,7 @@ enum InstCounterType { KM_CNT, // gfx12+ only. X_CNT, // gfx1250. ASYNC_CNT, // gfx1250. + TENSOR_CNT, // gfx1250. NUM_EXTENDED_INST_CNTS, VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only. VM_VSRC, // gfx12+ expert mode only. @@ -77,7 +78,8 @@ class Waitcnt { // gfx12+ constructor. 
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt, - unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc) + unsigned AsyncCnt, unsigned TensorCnt, unsigned VaVdst, + unsigned VmVsrc) : Waitcnt() { Cnt[LOAD_CNT] = LoadCnt; Cnt[DS_CNT] = DsCnt; @@ -88,6 +90,7 @@ class Waitcnt { Cnt[KM_CNT] = KmCnt; Cnt[X_CNT] = XCnt; Cnt[ASYNC_CNT] = AsyncCnt; + Cnt[TENSOR_CNT] = TensorCnt; Cnt[VA_VDST] = VaVdst; Cnt[VM_VSRC] = VmVsrc; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 0ce3807395839..7241c0db726ce 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -161,7 +161,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) { DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \ DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \ DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \ - DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */ + DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */ \ + DECL(TENSOR_ACCESS) /* access that uses TENSOR_CNT */ // clang-format off #define AMDGPU_EVENT_ENUM(Name) Name, @@ -221,7 +222,7 @@ static const unsigned AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT, - AMDGPU::S_WAIT_ASYNCCNT}; + AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT}; // ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware // code but still need to be processed by this pass for async vmcnt tracking. @@ -425,8 +426,9 @@ class WaitcntGenerator { // Returns a new waitcnt with all counters except VScnt set to 0. If // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. - // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated - // when a call to @llvm.amdgcn.wait.asyncmark() is processed. 
+ // AsyncCnt and TensorCnt always default to ~0u (don't wait for it). They + // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is + // processed. virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; virtual ~WaitcntGenerator() = default; @@ -446,6 +448,8 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator { WaitEventSet(), WaitEventSet(), WaitEventSet(), + WaitEventSet(), + WaitEventSet(), WaitEventSet()}; public: @@ -482,6 +486,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator { WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), WaitEventSet({VMEM_GROUP, SMEM_GROUP}), WaitEventSet({ASYNC_ACCESS}), + WaitEventSet({TENSOR_ACCESS}), WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE, VGPR_XDL_WRITE}), WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})}; @@ -675,6 +680,8 @@ class SIInsertWaitcnts { bool shouldUpdateAsyncMark(const MachineInstr &MI, AMDGPU::InstCounterType T) const { + if (SIInstrInfo::usesTENSOR_CNT(MI)) + return T == AMDGPU::TENSOR_CNT; if (!isAsyncLdsDmaWrite(MI)) return false; if (SIInstrInfo::usesASYNC_CNT(MI)) @@ -1835,6 +1842,8 @@ counterTypeForInstr(unsigned Opcode) { return AMDGPU::X_CNT; case AMDGPU::S_WAIT_ASYNCCNT: return AMDGPU::ASYNC_CNT; + case AMDGPU::S_WAIT_TENSORCNT: + return AMDGPU::TENSOR_CNT; default: return {}; } @@ -2090,8 +2099,8 @@ AMDGPU::Waitcnt WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { unsigned ExpertVal = IsExpertMode ? 0 : ~0u; return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 
0 : ~0u, 0, 0, 0, - ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal, - ExpertVal); + ~0u /* XCNT */, ~0u /* ASYNC_CNT */, + ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal); } /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and @@ -3057,9 +3066,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (SIInstrInfo::usesASYNC_CNT(Inst)) { ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst); } + } else if (SIInstrInfo::usesTENSOR_CNT(Inst)) { + ScoreBrackets->updateByEvent(TENSOR_ACCESS, Inst); } else if (Inst.isCall()) { - // Act as a wait on everything, but AsyncCnt is never included in such - // blanket waits. + // Act as a wait on everything, but AsyncCnt and TensorCnt are never + // included in such blanket waits. ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else if (TII.isVINTERP(Inst)) { @@ -3813,7 +3824,7 @@ bool SIInsertWaitcnts::run() { for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) { if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT || CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT || - CT == AMDGPU::ASYNC_CNT) + CT == AMDGPU::ASYNC_CNT || CT == AMDGPU::TENSOR_CNT) continue; if (!ST.hasImageInsts() && diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 20ab23df208f8..831aa9ebb8435 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1078,6 +1078,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::ASYNC_CNT; } + static bool usesTENSOR_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::TENSOR_CNT; + } + + bool usesTENSOR_CNT(uint32_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::TENSOR_CNT; + } + // Most sopk treat the immediate as a signed 16-bit, however some // use it as unsigned. 
static bool sopkIsZext(unsigned Opcode) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll index a8faa4620befa..9f30255a07095 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll @@ -5,6 +5,9 @@ ; %D4 should be zero-initialized for gfx1250, which only supports 4 groups of tensor descriptor declare void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol) declare void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol) +declare void @llvm.amdgcn.asyncmark() +declare void @llvm.amdgcn.wait.asyncmark(i16) +declare void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 %offset, i32 %cpol) define amdgpu_ps void @tensor_load_to_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { ; GFX1250-LABEL: tensor_load_to_lds_d4: @@ -271,3 +274,248 @@ define amdgpu_ps void @tensor_store_from_lds_d5(<4 x i32> inreg %D0, <8 x i32> i call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 22) ret void } + +;======================================================================= +; Tensor load/store DMAs are asynchronous and tracked via TENSOR_CNT. +; Verify that they participate in the asyncmark / wait_asyncmark +; mechanism. 
+;======================================================================== + +define amdgpu_ps void @tensor_load_to_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { +; GFX1250-LABEL: tensor_load_to_lds_with_asyncmark: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: ; wait_asyncmark(0) +; GFX1250-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.wait.asyncmark(i16 0) + ret void +} + +define amdgpu_ps void @tensor_store_from_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { +; GFX1250-LABEL: tensor_store_from_lds_with_asyncmark: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: ; wait_asyncmark(0) +; GFX1250-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.wait.asyncmark(i16 0) + ret void +} + +; Two outstanding tensor loads tracked by separate asyncmarks. The first +; wait_asyncmark(1) drains down to one outstanding TENSOR_CNT entry, the +; second wait_asyncmark(0) drains the remaining one. 
+define amdgpu_ps void @tensor_load_to_lds_two_asyncmarks(<4 x i32> inreg %D0a, <8 x i32> inreg %D1a, <4 x i32> inreg %D0b, <8 x i32> inreg %D1b, ptr addrspace(3) %lds) { +; GFX1250-LABEL: tensor_load_to_lds_two_asyncmarks: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: tensor_load_to_lds s[12:15], s[16:23] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: ; wait_asyncmark(1) +; GFX1250-NEXT: s_wait_tensorcnt 0x1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: ; wait_asyncmark(0) +; GFX1250-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-NEXT: ds_load_b32 v2, v0 offset:4 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0a, <8 x i32> %D1a, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0b, <8 x i32> %D1b, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.wait.asyncmark(i16 1) + %lds_v0 = load i32, ptr addrspace(3) %lds + + call void @llvm.amdgcn.wait.asyncmark(i16 0) + %lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1 + %lds_v1 = load i32, ptr addrspace(3) %lds_gep1 + + %sum = add i32 %lds_v0, %lds_v1 + store i32 %sum, ptr addrspace(3) %lds + + ret void +} + +;======================================================================= +; Mix ASYNC_CNT and TENSOR_CNT tracked operations under a single +; asyncmark, and verify that a wait.asyncmark drains both counters. 
+;======================================================================== + +define void @tensor_and_async_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1, ptr addrspace(1) %src, ptr addrspace(3) %dst) { +; GFX1250-LABEL: tensor_and_async_lds_with_asyncmark: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[16:23] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: ; wait_asyncmark(0) +; GFX1250-NEXT: s_wait_asynccnt 0x0 +; GFX1250-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 0, i32 0) + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.wait.asyncmark(i16 0) + ret void +} + +;======================================================================= +; Two diamond if/else regions, each picking either a tensor DMA or an +; async-LDS DMA, each followed by its own asyncmark. 
+;======================================================================== + +define void @tensor_or_async_lds_diamonds(i32 inreg %cond1, i32 inreg %cond2, <4 x i32> inreg %D0, <8 x i32> inreg %D1, ptr addrspace(1) %src, ptr addrspace(3) %dst) { +; GFX1250-SDAG-LABEL: tensor_or_async_lds_diamonds: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_mov_b32 s11, s25 +; GFX1250-SDAG-NEXT: s_mov_b32 s10, s24 +; GFX1250-SDAG-NEXT: s_mov_b32 s9, s23 +; GFX1250-SDAG-NEXT: s_mov_b32 s8, s22 +; GFX1250-SDAG-NEXT: s_mov_b32 s7, s21 +; GFX1250-SDAG-NEXT: s_mov_b32 s6, s20 +; GFX1250-SDAG-NEXT: s_mov_b32 s5, s19 +; GFX1250-SDAG-NEXT: s_mov_b32 s4, s18 +; GFX1250-SDAG-NEXT: s_mov_b32 s15, s17 +; GFX1250-SDAG-NEXT: s_mov_b32 s14, s16 +; GFX1250-SDAG-NEXT: s_mov_b32 s13, s3 +; GFX1250-SDAG-NEXT: s_mov_b32 s12, s2 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, -1 +; GFX1250-SDAG-NEXT: s_cbranch_scc1 .LBB14_6 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow1 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB14_3 +; GFX1250-SDAG-NEXT: .LBB14_2: ; %t1 +; GFX1250-SDAG-NEXT: tensor_load_to_lds s[12:15], s[4:11] +; GFX1250-SDAG-NEXT: ; asyncmark +; GFX1250-SDAG-NEXT: .LBB14_3: ; %merge1 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, -1 +; GFX1250-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 +; GFX1250-SDAG-NEXT: ; %bb.4: ; %Flow +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB14_8 +; GFX1250-SDAG-NEXT: .LBB14_5: ; %merge2 +; GFX1250-SDAG-NEXT: ; wait_asyncmark(1) +; GFX1250-SDAG-NEXT: s_wait_asynccnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-NEXT: .LBB14_6: ; %g1 +; GFX1250-SDAG-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off +; GFX1250-SDAG-NEXT: ; asyncmark +; GFX1250-SDAG-NEXT: 
s_cbranch_execz .LBB14_2 +; GFX1250-SDAG-NEXT: s_branch .LBB14_3 +; GFX1250-SDAG-NEXT: .LBB14_7: ; %g2 +; GFX1250-SDAG-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off +; GFX1250-SDAG-NEXT: ; asyncmark +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB14_5 +; GFX1250-SDAG-NEXT: .LBB14_8: ; %t2 +; GFX1250-SDAG-NEXT: tensor_load_to_lds s[12:15], s[4:11] +; GFX1250-SDAG-NEXT: ; asyncmark +; GFX1250-SDAG-NEXT: ; wait_asyncmark(1) +; GFX1250-SDAG-NEXT: s_wait_asynccnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_tensorcnt 0x1 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: tensor_or_async_lds_diamonds: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s12, s2 +; GFX1250-GISEL-NEXT: s_mov_b32 s13, s3 +; GFX1250-GISEL-NEXT: s_mov_b32 s14, s16 +; GFX1250-GISEL-NEXT: s_mov_b32 s15, s17 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, s18 +; GFX1250-GISEL-NEXT: s_mov_b32 s5, s19 +; GFX1250-GISEL-NEXT: s_mov_b32 s6, s20 +; GFX1250-GISEL-NEXT: s_mov_b32 s7, s21 +; GFX1250-GISEL-NEXT: s_mov_b32 s8, s22 +; GFX1250-GISEL-NEXT: s_mov_b32 s9, s23 +; GFX1250-GISEL-NEXT: s_mov_b32 s10, s24 +; GFX1250-GISEL-NEXT: s_mov_b32 s11, s25 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB14_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %g1 +; GFX1250-GISEL-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1250-GISEL-NEXT: ; asyncmark +; GFX1250-GISEL-NEXT: .LBB14_2: ; %Flow1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB14_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %t1 +; GFX1250-GISEL-NEXT: tensor_load_to_lds s[12:15], s[4:11] +; GFX1250-GISEL-NEXT: ; asyncmark +; GFX1250-GISEL-NEXT: .LBB14_4: ; %merge1 +; 
GFX1250-GISEL-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB14_6 +; GFX1250-GISEL-NEXT: ; %bb.5: ; %g2 +; GFX1250-GISEL-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1250-GISEL-NEXT: ; asyncmark +; GFX1250-GISEL-NEXT: .LBB14_6: ; %Flow +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s0, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB14_8 +; GFX1250-GISEL-NEXT: ; %bb.7: ; %t2 +; GFX1250-GISEL-NEXT: tensor_load_to_lds s[12:15], s[4:11] +; GFX1250-GISEL-NEXT: ; asyncmark +; GFX1250-GISEL-NEXT: .LBB14_8: ; %merge2 +; GFX1250-GISEL-NEXT: ; wait_asyncmark(1) +; GFX1250-GISEL-NEXT: s_wait_asynccnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +entry: + %c1 = icmp ne i32 %cond1, 0 + br i1 %c1, label %t1, label %g1 + +t1: + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + br label %merge1 + +g1: + call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + br label %merge1 + +merge1: + %c2 = icmp ne i32 %cond2, 0 + br i1 %c2, label %t2, label %g2 + +t2: + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + br label %merge2 + +g2: + call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + br label %merge2 + +merge2: + call void @llvm.amdgcn.wait.asyncmark(i16 1) + ret void +} _______________________________________________ 
cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
