https://github.com/adeshcom14 created https://github.com/llvm/llvm-project/pull/200775
Adds async variants of the tensor load-to/store-from LDS intrinsics and wires them into the existing asyncmark/wait.asyncmark mechanism via TENSOR_CNT waitcnt counter. Fixes: LCOMPILER-1619 >From ce9c0a38b057ce6318bfebde8a2858fc33f7decd Mon Sep 17 00:00:00 2001 From: Adesh Adikane <[email protected]> Date: Mon, 1 Jun 2026 10:12:19 +0000 Subject: [PATCH] [AMDGPU] Async Tensor Load/Store LDS --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 2 + ...iltins-amdgcn-gfx1250-tensor-load-store.cl | 66 ++++++++ .../builtins-amdgcn-error-gfx1250-param.cl | 2 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 29 +++- .../AMDGPU/AMDGPUInstructionSelector.cpp | 29 +++- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8 +- llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h | 5 +- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 5 +- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 33 +++- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 8 + .../llvm.amdgcn.async.tensor.load.store.ll | 154 ++++++++++++++++++ .../AMDGPU/reg-coalescer-subreg-liveness.mir | 12 +- .../test/CodeGen/AMDGPU/sched-ldsdma-mask.mir | 48 +++--- 15 files changed, 356 insertions(+), 49 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.async.tensor.load.store.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index b15a36df6c08f..12c62ec3d0f04 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -959,6 +959,8 @@ def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int( def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">; def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, 
int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">; +def __builtin_amdgcn_tensor_load_async_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">; +def __builtin_amdgcn_tensor_store_async_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">; def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl index 71b06e890ab9b..bf37f53a15734 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl @@ -76,3 +76,69 @@ void test_amdgcn_tensor_store_from_lds_d5(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8 { __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, 0); } + +//======================================================================= +// Async tensor load/store builtins. They lower to the same machine +// instructions as the sync variants but are tracked by the asyncmark +// mechanism. 
+//======================================================================== + +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_async_to_lds_d4( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_tensor_load_async_to_lds_d4(v4u sg0, v8i sg1, v4i sg2, v4i sg3) +{ + __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, sg2, sg3, v8i_zeros, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_async_to_lds_d2( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 27) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_tensor_load_async_to_lds_d2(v4u sg0, v8i sg1) +{ + __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_async_from_lds_d4( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_tensor_store_async_from_lds_d4(v4u sg0, v8i sg1, v4i sg2, v4i sg3) +{ + __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_async_from_lds_d2( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void 
test_amdgcn_tensor_store_async_from_lds_d2(v4u sg0, v8i sg1) +{ + __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_async_to_lds_d5( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_tensor_load_async_to_lds_d5(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4) +{ + __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, sg2, sg3, sg4, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_async_from_lds_d5( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_tensor_store_async_from_lds_d5(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4) +{ + __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, sg2, sg3, sg4, 0); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index c0d5a946f1e9f..1d867384bd514 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -189,6 +189,8 @@ void test_amdgcn_tensor_load_store(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4, { __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}} __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}} + __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // 
expected-error {{'__builtin_amdgcn_tensor_load_async_to_lds' must be a constant integer}} + __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_async_from_lds' must be a constant integer}} } void test_prefetch(generic void *fptr, global void *gptr, int cpol) { diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2fd5285dfc330..85dffb66a0029 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -4222,6 +4222,8 @@ class AMDGPUTensorLoadStore: def int_amdgcn_tensor_load_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore; def int_amdgcn_tensor_store_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore; +def int_amdgcn_tensor_load_async_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_async_to_lds">, AMDGPUTensorLoadStore; +def int_amdgcn_tensor_store_async_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_async_from_lds">, AMDGPUTensorLoadStore; class AMDGPUClusterLoad<LLVMType ptr_ty>: Intrinsic< diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ecf8d957fc80f..66001106c5995 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3040,11 +3040,32 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) { } void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) { - bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds; + bool IsLoad, IsAsync; + switch (IntrID) { + case Intrinsic::amdgcn_tensor_load_to_lds: + IsLoad = true; + IsAsync = false; + break; + case Intrinsic::amdgcn_tensor_store_from_lds: + IsLoad = false; + IsAsync = false; + break; + case Intrinsic::amdgcn_tensor_load_async_to_lds: + IsLoad = true; + IsAsync = true; + break; + case 
Intrinsic::amdgcn_tensor_store_async_from_lds: + IsLoad = false; + IsAsync = true; + break; + default: + llvm_unreachable("not a tensor load/store intrinsic"); + } + unsigned Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4; - SmallVector<SDValue, 7> TensorOps; + SmallVector<SDValue, 8> TensorOps; // First two groups TensorOps.push_back(N->getOperand(2)); // D# group 0 TensorOps.push_back(N->getOperand(3)); // D# group 1 @@ -3065,6 +3086,8 @@ void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) { // for now because all existing targets only support up to 4 groups. TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128 TensorOps.push_back(N->getOperand(7)); // cache policy + TensorOps.push_back( + CurDAG->getTargetConstant(IsAsync, SDLoc(N), MVT::i1)); // IsAsync TensorOps.push_back(N->getOperand(0)); // chain (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps); @@ -3354,6 +3377,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { return; case Intrinsic::amdgcn_tensor_load_to_lds: case Intrinsic::amdgcn_tensor_store_from_lds: + case Intrinsic::amdgcn_tensor_load_async_to_lds: + case Intrinsic::amdgcn_tensor_store_async_from_lds: SelectTensorLoadStore(N, IntrID); return; default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 463b8c40350b2..47407721ca325 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2470,6 +2470,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectGlobalLoadLds(I); case Intrinsic::amdgcn_tensor_load_to_lds: case Intrinsic::amdgcn_tensor_store_from_lds: + case Intrinsic::amdgcn_tensor_load_async_to_lds: + case Intrinsic::amdgcn_tensor_store_async_from_lds: return selectTensorLoadStore(I, IntrinsicID); case Intrinsic::amdgcn_asyncmark: case 
Intrinsic::amdgcn_wait_asyncmark: @@ -3872,7 +3874,27 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI, Intrinsic::ID IID) const { - bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds; + bool IsLoad, IsAsync; + switch (IID) { + case Intrinsic::amdgcn_tensor_load_to_lds: + IsLoad = true; + IsAsync = false; + break; + case Intrinsic::amdgcn_tensor_store_from_lds: + IsLoad = false; + IsAsync = false; + break; + case Intrinsic::amdgcn_tensor_load_async_to_lds: + IsLoad = true; + IsAsync = true; + break; + case Intrinsic::amdgcn_tensor_store_async_from_lds: + IsLoad = false; + IsAsync = true; + break; + default: + llvm_unreachable("not a tensor load/store intrinsic"); + } unsigned Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4; int NumGroups = 4; @@ -3904,8 +3926,9 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI, .add(MI.getOperand(4)); // D# group 3 } - MIB.addImm(0) // r128 - .add(MI.getOperand(6)); // cpol + MIB.addImm(0) // r128 + .add(MI.getOperand(6)) // cpol + .addImm(IsAsync ? 
1 : 0); // IsAsync MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index a24df782cf28a..62c7991dec96f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3382,7 +3382,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_tensor_load_to_lds: - case Intrinsic::amdgcn_tensor_store_from_lds: { + case Intrinsic::amdgcn_tensor_store_from_lds: + case Intrinsic::amdgcn_tensor_load_async_to_lds: + case Intrinsic::amdgcn_tensor_store_async_from_lds: { constrainOpWithReadfirstlane(B, MI, 1); constrainOpWithReadfirstlane(B, MI, 2); constrainOpWithReadfirstlane(B, MI, 3); @@ -5647,7 +5649,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_pops_exiting_wave_id: return getDefaultMappingSOP(MI); case Intrinsic::amdgcn_tensor_load_to_lds: - case Intrinsic::amdgcn_tensor_store_from_lds: { + case Intrinsic::amdgcn_tensor_store_from_lds: + case Intrinsic::amdgcn_tensor_load_async_to_lds: + case Intrinsic::amdgcn_tensor_store_async_from_lds: { // Lie and claim everything is legal, even all operands need to be // SGPRs. applyMapping will have to deal with it with readfirstlane. 
for (unsigned I = 1; I < MI.getNumOperands(); ++I) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp index 282eaba6586a7..df8d22fb5e3dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp @@ -35,6 +35,8 @@ StringLiteral getInstCounterName(InstCounterType T) { return "X_CNT"; case ASYNC_CNT: return "ASYNC_CNT"; + case TENSOR_CNT: + return "TENSOR_CNT"; case VA_VDST: return "VA_VDST"; case VM_VSRC: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h index 24dbcdf8cc475..093d8a45d207b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h @@ -30,6 +30,7 @@ enum InstCounterType { KM_CNT, // gfx12+ only. X_CNT, // gfx1250. ASYNC_CNT, // gfx1250. + TENSOR_CNT, // gfx1250. NUM_EXTENDED_INST_CNTS, VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only. VM_VSRC, // gfx12+ expert mode only. @@ -77,7 +78,8 @@ class Waitcnt { // gfx12+ constructor. 
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt, - unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc) + unsigned AsyncCnt, unsigned TensorCnt, unsigned VaVdst, + unsigned VmVsrc) : Waitcnt() { Cnt[LOAD_CNT] = LoadCnt; Cnt[DS_CNT] = DsCnt; @@ -88,6 +90,7 @@ class Waitcnt { Cnt[KM_CNT] = KmCnt; Cnt[X_CNT] = XCnt; Cnt[ASYNC_CNT] = AsyncCnt; + Cnt[TENSOR_CNT] = TensorCnt; Cnt[VA_VDST] = VaVdst; Cnt[VM_VSRC] = VmVsrc; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 0f31697f15688..34efcd0cf4cde 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2182,9 +2182,10 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> : let hasSideEffects = 0; bit UpTo2D = _UpTo2D; - let InOperandList = !if(UpTo2D, (ins SReg_128_XNULL:$vaddr0, SReg_256_XNULL:$vaddr1, R128A16:$r128, CPol:$cpol), + let InOperandList = !if(UpTo2D, (ins SReg_128_XNULL:$vaddr0, SReg_256_XNULL:$vaddr1, + R128A16:$r128, CPol:$cpol, i1imm:$IsAsync), (ins SReg_128_XNULL:$vaddr0, SReg_256_XNULL:$vaddr1, SReg_128_XNULL:$vaddr2, - SReg_128_XNULL:$vaddr3, R128A16:$r128, CPol:$cpol)); + SReg_128_XNULL:$vaddr3, R128A16:$r128, CPol:$cpol, i1imm:$IsAsync)); string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol"; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 0ce3807395839..db2f55552cb96 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -161,7 +161,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) { DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \ DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \ DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \ - DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */ + DECL(ASYNC_ACCESS) /* access 
that uses ASYNC_CNT */ \ + DECL(TENSOR_ACCESS) /* access that uses TENSOR_CNT */ // clang-format off #define AMDGPU_EVENT_ENUM(Name) Name, @@ -221,7 +222,7 @@ static const unsigned AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT, - AMDGPU::S_WAIT_ASYNCCNT}; + AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT}; // ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware // code but still need to be processed by this pass for async vmcnt tracking. @@ -425,8 +426,9 @@ class WaitcntGenerator { // Returns a new waitcnt with all counters except VScnt set to 0. If // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. - // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated - // when a call to @llvm.amdgcn.wait.asyncmark() is processed. + // AsyncCnt and TensorCnt always default to ~0u (don't wait for them). They + // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is + // processed. 
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; virtual ~WaitcntGenerator() = default; @@ -446,6 +448,8 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator { WaitEventSet(), WaitEventSet(), WaitEventSet(), + WaitEventSet(), + WaitEventSet(), WaitEventSet()}; public: @@ -482,6 +486,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator { WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), WaitEventSet({VMEM_GROUP, SMEM_GROUP}), WaitEventSet({ASYNC_ACCESS}), + WaitEventSet({TENSOR_ACCESS}), WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE, VGPR_XDL_WRITE}), WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})}; @@ -673,8 +678,14 @@ class SIInsertWaitcnts { return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI); } + bool isAsyncTensorDMA(const MachineInstr &MI) const { + return SIInstrInfo::usesTENSOR_CNT(MI) && isAsync(MI); + } + bool shouldUpdateAsyncMark(const MachineInstr &MI, AMDGPU::InstCounterType T) const { + if (isAsyncTensorDMA(MI)) + return T == AMDGPU::TENSOR_CNT; if (!isAsyncLdsDmaWrite(MI)) return false; if (SIInstrInfo::usesASYNC_CNT(MI)) @@ -1835,6 +1846,8 @@ counterTypeForInstr(unsigned Opcode) { return AMDGPU::X_CNT; case AMDGPU::S_WAIT_ASYNCCNT: return AMDGPU::ASYNC_CNT; + case AMDGPU::S_WAIT_TENSORCNT: + return AMDGPU::TENSOR_CNT; default: return {}; } @@ -2090,8 +2103,8 @@ AMDGPU::Waitcnt WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { unsigned ExpertVal = IsExpertMode ? 0 : ~0u; return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 
0 : ~0u, 0, 0, 0, - ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal, - ExpertVal); + ~0u /* XCNT */, ~0u /* ASYNC_CNT */, + ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal); } /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and @@ -3057,9 +3070,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (SIInstrInfo::usesASYNC_CNT(Inst)) { ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst); } + } else if (isAsyncTensorDMA(Inst)) { + ScoreBrackets->updateByEvent(TENSOR_ACCESS, Inst); } else if (Inst.isCall()) { - // Act as a wait on everything, but AsyncCnt is never included in such - // blanket waits. + // Act as a wait on everything, but AsyncCnt and TensorCnt are never + // included in such blanket waits. ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else if (TII.isVINTERP(Inst)) { @@ -3813,7 +3828,7 @@ bool SIInsertWaitcnts::run() { for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) { if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT || CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT || - CT == AMDGPU::ASYNC_CNT) + CT == AMDGPU::ASYNC_CNT || CT == AMDGPU::TENSOR_CNT) continue; if (!ST.hasImageInsts() && diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index f5658021df954..7618910d095ba 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1078,6 +1078,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::ASYNC_CNT; } + static bool usesTENSOR_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::TENSOR_CNT; + } + + bool usesTENSOR_CNT(uint32_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::TENSOR_CNT; + } + // Most sopk treat the immediate as a signed 16-bit, however some // use it as unsigned. 
static bool sopkIsZext(unsigned Opcode) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.async.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.async.tensor.load.store.ll new file mode 100644 index 0000000000000..d897d2bb5c396 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.async.tensor.load.store.ll @@ -0,0 +1,154 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s + +declare void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol) +declare void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol) + +define amdgpu_ps void @tensor_load_async_to_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { +; GFX1250-LABEL: tensor_load_async_to_lds_d4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> zeroinitializer, i32 0) + ret void +} + +define amdgpu_ps void @tensor_load_async_to_lds_d2(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { +; GFX1250-LABEL: tensor_load_async_to_lds_d2: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> 
zeroinitializer, i32 27) + ret void +} + +define amdgpu_ps void @tensor_store_async_from_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { +; GFX1250-LABEL: tensor_store_async_from_lds_d4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> zeroinitializer, i32 22) + ret void +} + +define amdgpu_ps void @tensor_store_async_from_lds_d2(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { +; GFX1250-LABEL: tensor_store_async_from_lds_d2: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11] +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + ret void +} + +;======================================================================= +; It is fine to pass 5 arguments as tensor descriptor, but the fifth one +; will be ignored silently by the CodeGen for gfx1250, which only +; supports D# up to 4 groups. 
+;======================================================================== + +define amdgpu_ps void @tensor_load_async_to_lds_d5(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3, <8 x i32> inreg %D4) { +; GFX1250-LABEL: tensor_load_async_to_lds_d5: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 0) + ret void +} + +define amdgpu_ps void @tensor_store_async_from_lds_d5(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3, <8 x i32> inreg %D4) { +; GFX1250-LABEL: tensor_store_async_from_lds_d5: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 22) + ret void +} + +;======================================================================= +; Async tensor load/store interleaved with asyncmark and wait_asyncmark. 
+;======================================================================== + +define amdgpu_ps void @tensor_load_async_to_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { +; GFX1250-LABEL: tensor_load_async_to_lds_with_asyncmark: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: ; wait_asyncmark(0) +; GFX1250-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.wait.asyncmark(i16 0) + ret void +} + +define amdgpu_ps void @tensor_store_async_from_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { +; GFX1250-LABEL: tensor_store_async_from_lds_with_asyncmark: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: ; wait_asyncmark(0) +; GFX1250-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-NEXT: s_endpgm + call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.wait.asyncmark(i16 0) + ret void +} + + + +define amdgpu_ps void @tensor_load_async_to_lds_two_asyncmarks(<4 x i32> inreg %D0a, <8 x i32> inreg %D1a, +; GFX1250-LABEL: tensor_load_async_to_lds_two_asyncmarks: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: tensor_load_to_lds s[12:15], 
s[16:23] +; GFX1250-NEXT: ; asyncmark +; GFX1250-NEXT: ; wait_asyncmark(1) +; GFX1250-NEXT: s_wait_tensorcnt 0x1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: ; wait_asyncmark(0) +; GFX1250-NEXT: s_wait_tensorcnt 0x0 +; GFX1250-NEXT: ds_load_b32 v2, v0 offset:4 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + <4 x i32> inreg %D0b, <8 x i32> inreg %D1b, + ptr addrspace(3) %lds) { + call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> %D0a, <8 x i32> %D1a, + <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, + <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> %D0b, <8 x i32> %D1b, + <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, + <8 x i32> zeroinitializer, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.wait.asyncmark(i16 1) + %lds_v0 = load i32, ptr addrspace(3) %lds + + call void @llvm.amdgcn.wait.asyncmark(i16 0) + %lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1 + %lds_v1 = load i32, ptr addrspace(3) %lds_gep1 + + %sum = add i32 %lds_v0, %lds_v1 + store i32 %sum, ptr addrspace(3) %lds + + ret void +} + +declare void @llvm.amdgcn.asyncmark() +declare void @llvm.amdgcn.wait.asyncmark(i16) diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir index 00c4ec981111e..f4d4a1b44aba6 100644 --- a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir +++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir @@ -18,7 +18,7 @@ body: | ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, 
implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: {{ $}} @@ -27,8 +27,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: $vcc_lo = COPY $exec_lo ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1 @@ -47,7 +47,7 @@ body: | undef %3.sub0:sgpr_128 = COPY %2 %4:sreg_32 = S_MOV_B32 0 undef %5.sub0:sgpr_256 = COPY %4 - TENSOR_LOAD_TO_LDS_d2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2 %3, %5, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt %6:sgpr_128 = COPY killed %3 %6.sub1:sgpr_128 = COPY killed %1 %7:sreg_32 = COPY $exec_lo @@ -62,11 +62,11 @@ body: | %11.sub1:sgpr_128 = COPY killed %10 %11.sub2:sgpr_128 = COPY %2 %11.sub3:sgpr_128 = COPY %2 - TENSOR_LOAD_TO_LDS_d2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2 killed 
%11, %5, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt %12:sreg_32 = COPY killed %9 %13:sgpr_128 = COPY %6 %13.sub2:sgpr_128 = COPY killed %12 - TENSOR_LOAD_TO_LDS_d2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2 killed %13, %5, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt $vcc_lo = COPY %7 %8:sreg_32 = COPY %4 %9:sreg_32 = COPY %2 diff --git a/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir b/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir index 82358f80329cb..a37ac32507680 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir @@ -29,14 +29,14 @@ body: | ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]], [[DEF7]], implicit $exec - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: 
TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF3]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF8]], [[DEF9]], implicit $exec - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF5]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF1]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 0 ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0 @@ -56,12 +56,12 @@ body: | %21:vgpr_32 = IMPLICIT_DEF %22:vgpr_32 = IMPLICIT_DEF %23:vgpr_32 = IMPLICIT_DEF - TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit 
$tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %3, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %5, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %1, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt %30:vgpr_32 = V_ADD_U32_e32 %20, %21, implicit $exec %31:vgpr_32 = V_ADD_U32_e32 %22, %23, implicit $exec %32:vgpr_32 = V_ADD_U32_e32 %30, %31, implicit $exec @@ -91,14 +91,14 @@ body: | ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]], [[DEF7]], implicit $exec - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF5]], 0, 0, 0, implicit-def dead $tensorcnt, 
implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[DEF8]], implicit $exec - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF1]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF3]], 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[DEF9]], implicit $exec ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 2 @@ -119,12 +119,12 @@ body: | %21:vgpr_32 = IMPLICIT_DEF %22:vgpr_32 = IMPLICIT_DEF %23:vgpr_32 = IMPLICIT_DEF - TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, 0, implicit-def dead 
$tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %5, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %1, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %3, 0, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt %30:vgpr_32 = V_ADD_U32_e32 %20, %21, implicit $exec %31:vgpr_32 = V_ADD_U32_e32 %30, %22, implicit $exec %32:vgpr_32 = V_ADD_U32_e32 %31, %23, implicit $exec _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
