https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/107659
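In short, with this change an ID or dimension query whose bound is statically known lowers to an NVVM special-register read that carries an explicit range. A minimal sketch of the intended effect, adapted from the updated gpu-to-nvvm.mlir test further down in the patch (the module, function, and value names here are illustrative, not part of the patch):

```mlir
// A kernel whose block size is statically known to be 32x4x2.
gpu.module @example {
  gpu.func @kernel(%arg0: !llvm.ptr) kernel
      attributes {known_block_size = array<i32: 32, 4, 2>} {
    %tx = gpu.thread_id x  // -> nvvm.read.ptx.sreg.tid.x range <0 : i32, 32 : i32> : i32
    %ty = gpu.thread_id y  // -> nvvm.read.ptx.sreg.tid.y range <0 : i32, 4 : i32> : i32
    %tz = gpu.thread_id z  // -> nvvm.read.ptx.sreg.tid.z range <0 : i32, 2 : i32> : i32
    // Keep the values live so they are not folded away before lowering.
    %sum = arith.addi %tx, %ty : index
    %sum2 = arith.addi %sum, %tz : index
    %i = arith.index_cast %sum2 : index to i64
    llvm.store %i, %arg0 : i64, !llvm.ptr
    gpu.return
  }
}
```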
Update the GPU to NVVM lowerings to correctly propagate range information on
IDs and dimension queries, either from known_{block,grid}_size attributes or
from `upperBound` annotations on the operations themselves.

>From f50dcd32b4ce02dc5046f8a3df3628b4b2096030 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <krzysztof.drewn...@amd.com>
Date: Fri, 6 Sep 2024 23:45:52 +0000
Subject: [PATCH] [mlir][GPU] Plumb range information through the NVVM
 lowerings

Update the GPU to NVVM lowerings to correctly propagate range information on
IDs and dimension queries, either from known_{block,grid}_size attributes or
from `upperBound` annotations on the operations themselves.
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td   | 283 +++++++++---------
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  50 +++-
 .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp  |   1 +
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     |  18 +-
 mlir/test/Target/LLVMIR/Import/nvvmir.ll      |   3 +
 mlir/test/Target/LLVMIR/nvvmir.mlir           |   7 +-
 6 files changed, 207 insertions(+), 155 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 709dd922b8fa2f..66ac9f289d233b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -86,8 +86,8 @@ class NVVM_Op<string mnemonic, list<Trait> traits = []> :
   LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
 }
 
-/// Base class that defines BasicPtxBuilderOpInterface.
-class NVVM_PTXBuilder_Op<string mnemonic,
+/// Base class that defines BasicPtxBuilderOpInterface.
+class NVVM_PTXBuilder_Op<string mnemonic,
   list<Trait> traits = [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]> :
   LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
 }
@@ -123,52 +123,67 @@ class NVVM_SpecialRegisterOp<string mnemonic, list<Trait> traits = []> :
   let assemblyFormat = "attr-dict `:` type($res)";
 }
 
+class NVVM_SpecialRangeableRegisterOp<string mnemonic, list<Trait> traits = []> :
+  NVVM_SpecialRegisterOp<mnemonic, traits> {
+  let arguments = (ins OptionalAttr<LLVM_ConstantRangeAttr>:$range);
+  let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
+  let llvmBuilder = baseLlvmBuilder # setRangeRetAttrCode # baseLlvmBuilderCoda;
+  let mlirBuilder = baseMlirBuilder # importRangeRetAttrCode # baseMlirBuilderCoda;
+
+  // Backwards-compatibility builder for an unspecified range.
+ let builders = [ + OpBuilder<(ins "Type":$resultType), [{ + build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{}); + }]> + ]; +} + //===----------------------------------------------------------------------===// // Lane index and range -def NVVM_LaneIdOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.laneid">; -def NVVM_WarpSizeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.warpsize">; +def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">; +def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">; //===----------------------------------------------------------------------===// // Thread index and range -def NVVM_ThreadIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.x">; -def NVVM_ThreadIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.y">; -def NVVM_ThreadIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.z">; -def NVVM_BlockDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.x">; -def NVVM_BlockDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.y">; -def NVVM_BlockDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.z">; +def NVVM_ThreadIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.x">; +def NVVM_ThreadIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.y">; +def NVVM_ThreadIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.z">; +def NVVM_BlockDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.x">; +def NVVM_BlockDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.y">; +def NVVM_BlockDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.z">; //===----------------------------------------------------------------------===// // Block index and range -def NVVM_BlockIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.x">; -def NVVM_BlockIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.y">; -def NVVM_BlockIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.z">; -def NVVM_GridDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.x">; -def NVVM_GridDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.y">; -def NVVM_GridDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.z">; +def NVVM_BlockIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.x">; +def NVVM_BlockIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.y">; +def NVVM_BlockIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.z">; +def NVVM_GridDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.x">; +def NVVM_GridDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.y">; +def NVVM_GridDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.z">; //===----------------------------------------------------------------------===// // CTA Cluster index and range -def NVVM_ClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.x">; -def NVVM_ClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.y">; -def NVVM_ClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.z">; -def NVVM_ClusterDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.x">; -def NVVM_ClusterDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.y">; -def NVVM_ClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.z">; +def NVVM_ClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.x">; +def NVVM_ClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.y">; +def NVVM_ClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.z">; +def NVVM_ClusterDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.x">; +def 
NVVM_ClusterDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.y">; +def NVVM_ClusterDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.z">; //===----------------------------------------------------------------------===// // CTA index and range within Cluster -def NVVM_BlockInClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.x">; -def NVVM_BlockInClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.y">; -def NVVM_BlockInClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.z">; -def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">; -def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">; -def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; +def NVVM_BlockInClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x">; +def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y">; +def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z">; +def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x">; +def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y">; +def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; //===----------------------------------------------------------------------===// // CTA index and across Cluster dimensions -def NVVM_ClusterId : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctarank">; -def NVVM_ClusterDim : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctarank">; +def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank">; +def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">; //===----------------------------------------------------------------------===// // Clock registers @@ -197,11 +212,11 @@ def ReduxKindMin : I32EnumAttrCase<"MIN", 4, "min">; def ReduxKindOr : I32EnumAttrCase<"OR", 5, "or">; def ReduxKindUmax : I32EnumAttrCase<"UMAX", 6, "umax">; def ReduxKindUmin : I32EnumAttrCase<"UMIN", 7, "umin">; -def ReduxKindXor : I32EnumAttrCase<"XOR", 8, "xor">; +def ReduxKindXor : I32EnumAttrCase<"XOR", 8, "xor">; /// Enum attribute of the different kinds. def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind", - [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr, + [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr, ReduxKindUmax, ReduxKindUmin, ReduxKindXor]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; @@ -221,7 +236,7 @@ def NVVM_ReduxOp : }]; let assemblyFormat = [{ $kind $val `,` $mask_and_clamp attr-dict `:` type($val) `->` type($res) - }]; + }]; } //===----------------------------------------------------------------------===// @@ -308,7 +323,7 @@ def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete. let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; } -def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, +def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, Arguments<(ins LLVM_AnyPointer:$addr, I32:$txcount, PtxPredicate:$predicate)> { let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? 
attr-dict `:` type(operands)"; let extraClassDefinition = [{ @@ -316,16 +331,16 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t }]; } -def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">, - Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> { +def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">, + Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> { let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"); } }]; } -def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> { +def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">, + Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> { let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { @@ -338,13 +353,13 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity" "bra.uni LAB_WAIT; \n\t" "DONE: \n\t" "}" - ); + ); } }]; } -def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">, - Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> { +def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">, + Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> { let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { @@ -357,7 +372,7 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p "bra.uni LAB_WAIT; \n\t" "DONE: \n\t" "}" - ); + ); } }]; } @@ -392,7 +407,7 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> { } def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> { - let arguments = (ins + let arguments = (ins Optional<I32>:$barrierId, Optional<I32>:$numberOfThreads); string llvmBuilder = [{ @@ -401,7 +416,7 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> { {$barrierId, $numberOfThreads}); } else if($barrierId) { createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_n, - {$barrierId}); + {$barrierId}); } else { createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier0); } @@ -410,27 +425,27 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> { let assemblyFormat = "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? attr-dict"; } -def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> +def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> { let arguments = (ins Optional<I32>:$barrierId, I32:$numberOfThreads); let description = [{ - Thread that executes this op announces their arrival at the barrier with + Thread that executes this op announces their arrival at the barrier with given id and continue their execution. - The default barrier id is 0 that is similar to `nvvm.barrier` Op. When - `barrierId` is not present, the default barrier id is used. + The default barrier id is 0 that is similar to `nvvm.barrier` Op. When + `barrierId` is not present, the default barrier id is used. 
[For more information, see PTX ISA] (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar) }]; - + let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { std::string ptx = "bar.arrive "; - if (getBarrierId()) { ptx += "%0, %1;"; } + if (getBarrierId()) { ptx += "%0, %1;"; } else { ptx += "0, %0;"; } return ptx; } @@ -553,7 +568,7 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">, [For more information, see PTX ISA] (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar) }]; - + let assemblyFormat = "attr-dict"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { @@ -671,9 +686,9 @@ def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> { [For more information, see PTX ISA] (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar) }]; - + let assemblyFormat = "attr-dict"; - let extraClassDefinition = [{ + let extraClassDefinition = [{ std::string $cppClass::getPtx() { return std::string("fence.mbarrier_init.release.cluster;"); } @@ -749,13 +764,13 @@ def NVVM_SyncWarpOp : } -def NVVM_ElectSyncOp : NVVM_Op<"elect.sync", +def NVVM_ElectSyncOp : NVVM_Op<"elect.sync", [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]> -{ +{ let results = (outs I1:$pred); - let assemblyFormat = "attr-dict `->` type(results)"; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { + let assemblyFormat = "attr-dict `->` type(results)"; + let extraClassDefinition = [{ + std::string $cppClass::getPtx() { return std::string( "{ \n" ".reg .u32 rx; \n" @@ -764,7 +779,7 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync", " elect.sync rx | px, 0xFFFFFFFF;\n" "@px mov.pred %0, 1; \n" "}\n" - ); + ); } }]; } @@ -776,16 +791,16 @@ def LoadCacheModifierLU : I32EnumAttrCase<"LU", 3, "lu">; def LoadCacheModifierCV : I32EnumAttrCase<"CV", 4, "cv">; /// Enum attribute of the different kinds. -def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind", +def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind", "NVVM load cache modifier kind", - [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS, + [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS, LoadCacheModifierLU, LoadCacheModifierCV]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; let description = [{ Enum attribute of the different kinds of cache operators for load instructions. 
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62) + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62) }]; } @@ -811,7 +826,7 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">, id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16; else if($modifier == NVVM::LoadCacheModifierKind::CA) id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16; - else + else llvm_unreachable("unsupported cache modifier"); break; default: @@ -824,21 +839,21 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">, let extraClassDeclaration = [{ bool hasIntrinsic() { if(getCpSize()) return false; return true; } - void getAsmValues(RewriterBase &rewriter, + void getAsmValues(RewriterBase &rewriter, llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues) { asmValues.push_back({getDst(), PTXRegisterMod::Read}); asmValues.push_back({getSrc(), PTXRegisterMod::Read}); asmValues.push_back({makeConstantI32(rewriter, getSize()), PTXRegisterMod::Read}); asmValues.push_back({getCpSize(), PTXRegisterMod::Read}); - } + } }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { + let extraClassDefinition = [{ + std::string $cppClass::getPtx() { if(getModifier() == NVVM::LoadCacheModifierKind::CG) return std::string("cp.async.cg.shared.global [%0], [%1], %2, %3;\n"); if(getModifier() == NVVM::LoadCacheModifierKind::CA) return std::string("cp.async.ca.shared.global [%0], [%1], %2, %3;\n"); - llvm_unreachable("unsupported cache modifier"); + llvm_unreachable("unsupported cache modifier"); } }]; } @@ -1526,9 +1541,9 @@ def NVVM_WMMAMmaOp : NVVM_Op<"wmma.mma">, let hasVerifier = 1; } -def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">, - Arguments<(ins LLVM_PointerShared:$ptr, - Variadic<I32>:$sources, +def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">, + Arguments<(ins LLVM_PointerShared:$ptr, + Variadic<I32>:$sources, MMALayoutAttr:$layout)> { let summary = "cooperative matrix store"; let description = [{ @@ -1537,7 +1552,7 @@ def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">, [For more information, see PTX ISA] (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix) }]; - + let assemblyFormat = "$ptr `,` $sources attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { @@ -1757,25 +1772,25 @@ def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">, } def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">, - Arguments<(ins - ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group, + Arguments<(ins + ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group, OptionalAttr<UnitAttr>:$read)> { let assemblyFormat = "$group attr-dict"; let description = [{ Op waits for completion of the most recent bulk async-groups. The `$group` operand tells waiting has to be done until for $group or fewer - of the most recent bulk async-groups. If `$group` is 0, the op wait until + of the most recent bulk async-groups. If `$group` is 0, the op wait until all the most recent bulk async-groups have completed. - The `$read` indicates that the waiting has to be done until all the bulk - async operations in the specified bulk async-group have completed reading + The `$read` indicates that the waiting has to be done until all the bulk + async operations in the specified bulk async-group have completed reading from their source locations. 
[For more information, see PTX ISA] (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group) }]; - + string llvmBuilder = [{ auto intId = op.getRead() ? llvm::Intrinsic::nvvm_cp_async_bulk_wait_group_read : @@ -1784,53 +1799,53 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">, }]; } -def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : - NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", - [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, +def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : + NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", + [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, AttrSizedOperandSegments]>, Arguments<(ins LLVM_PointerShared:$dstMem, LLVM_AnyPointer:$tmaDescriptor, Variadic<I32>:$coordinates, - LLVM_PointerShared:$mbar, + LLVM_PointerShared:$mbar, Variadic<I16>:$im2colOffsets, Optional<I16>:$multicastMask, Optional<I64>:$l2CacheHint, PtxPredicate:$predicate)> { let description = [{ - Initiates an asynchronous copy operation on the tensor data from global - memory to shared memory. + Initiates an asynchronous copy operation on the tensor data from global + memory to shared memory. The Op operates has two load modes: - 1) Tiled Mode: It's the default mode. The source multi-dimensional tensor - layout is preserved at the destination. + 1) Tiled Mode: It's the default mode. The source multi-dimensional tensor + layout is preserved at the destination. 2) Im2col Mode: This mode is used when `im2colOffsets` operands are present. the elements in the Bounding Box of the source tensor are rearranged into - columns at the destination. In this mode, the tensor has to be at least - 3-dimensional. + columns at the destination. In this mode, the tensor has to be at least + 3-dimensional. The `multicastMask` operand is optional. When it is present, the Op copies data from global memory to shared memory of multiple CTAs in the cluster. - Operand `multicastMask` specifies the destination CTAs in the cluster such + Operand `multicastMask` specifies the destination CTAs in the cluster such that each bit position in the 16-bit `multicastMask` operand corresponds to - the `nvvm.read.ptx.sreg.ctaid` of the destination CTA. + the `nvvm.read.ptx.sreg.ctaid` of the destination CTA. - The `l2CacheHint` operand is optional, and it is used to specify cache + The `l2CacheHint` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. - + [For more information, see PTX ISA] (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) }]; - let assemblyFormat = [{ - $dstMem `,` - $tmaDescriptor `,` - $mbar `,` - `box` `[`$coordinates `]` + let assemblyFormat = [{ + $dstMem `,` + $tmaDescriptor `,` + $mbar `,` + `box` `[`$coordinates `]` (`im2col` `[` $im2colOffsets^ `]` )? (`multicast_mask` `=` $multicastMask^ )? (`l2_cache_hint` `=` $l2CacheHint^ )? - (`predicate` `=` $predicate^)? + (`predicate` `=` $predicate^)? 
attr-dict `:` type($dstMem) `,` type($tmaDescriptor) }]; @@ -1840,16 +1855,16 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : int dim = getCoordinates().size(); std::string ptx = "cp.async.bulk.tensor."; ptx += std::to_string(dim) + "d."; - ptx += "shared::cluster.global.mbarrier::complete_tx::bytes"; + ptx += "shared::cluster.global.mbarrier::complete_tx::bytes"; if(im2colDim) ptx += ".im2col"; - if(getMulticastMask()) ptx += ".multicast::cluster"; + if(getMulticastMask()) ptx += ".multicast::cluster"; if(getL2CacheHint()) ptx += ".L2::cache_hint"; - + auto preg = [](int r) { return "%" + std::to_string(r); }; // Build Registers ptx += " [%0], [%1, {"; - int r = 2; + int r = 2; for(int i = 0; i < dim; i++) ptx += preg(r+i) + ","; ptx.pop_back(); r += dim; ptx += "} ], [%" + std::to_string(r++) + "]"; @@ -1868,19 +1883,19 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : let hasVerifier = 1; } -def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp : - NVVM_Op<"cp.async.bulk.tensor.global.shared.cta", - [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, +def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp : + NVVM_Op<"cp.async.bulk.tensor.global.shared.cta", + [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, AttrSizedOperandSegments]>, Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, LLVM_PointerShared:$srcMem, Variadic<I32>:$coordinates, PtxPredicate:$predicate)> { - let assemblyFormat = [{ - $tmaDescriptor `,` - $srcMem `,` - `box` `[`$coordinates `]` - (`,` `predicate` `=` $predicate^)? + let assemblyFormat = [{ + $tmaDescriptor `,` + $srcMem `,` + `box` `[`$coordinates `]` + (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands) }]; let extraClassDefinition = [{ @@ -1905,7 +1920,7 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap", Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> { let assemblyFormat = "$tmaDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; let extraClassDefinition = [{ - std::string $cppClass::getPtx() { + std::string $cppClass::getPtx() { return std::string("prefetch.tensormap [%0];"); } }]; @@ -1918,9 +1933,9 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap", def NVVM_WgmmaFenceAlignedOp : NVVM_PTXBuilder_Op<"wgmma.fence.aligned"> { let arguments = (ins); let description = [{ - Enforce an ordering of register accesses between warpgroup level matrix - multiplication and other operations. - + Enforce an ordering of register accesses between warpgroup level matrix + multiplication and other operations. + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence) }]; let assemblyFormat = "attr-dict"; @@ -1934,7 +1949,7 @@ def NVVM_WgmmaGroupSyncAlignedOp : NVVM_PTXBuilder_Op<"wgmma.commit.group.sync.a let assemblyFormat = "attr-dict"; let description = [{ Commits all prior uncommitted warpgroup level matrix multiplication operations. - + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group) }]; let extraClassDefinition = [{ @@ -1947,7 +1962,7 @@ def NVVM_WgmmaWaitGroupSyncOp : NVVM_PTXBuilder_Op<"wgmma.wait.group.sync.aligne let assemblyFormat = "attr-dict $group"; let description = [{ Signal the completion of a preceding warpgroup operation. 
- + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group) }]; let extraClassDefinition = [{ @@ -1994,7 +2009,7 @@ def WGMMATypeS32 : I32EnumAttrCase<"s32", 9>; def WGMMATypes : I32EnumAttr<"WGMMATypes", "NVVM WGMMA types", [WGMMATypeF16, WGMMATypeTF32, WGMMATypeU8, WGMMATypeS8, - WGMMATypeB1, WGMMATypeBF16, WGMMATypeF8E4M3, + WGMMATypeB1, WGMMATypeBF16, WGMMATypeF8E4M3, WGMMATypeF8E5M2, WGMMATypeF32, WGMMATypeS32]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; @@ -2004,44 +2019,44 @@ def WGMMATypesAttr : EnumAttr<NVVM_Dialect, WGMMATypes, "wgmma_type"> { } -def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async", +def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async", [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, PredOpTrait<"input struct and result struct must be the same type", - TCresIsSameAsOpBase<0, 0>>,]> + TCresIsSameAsOpBase<0, 0>>,]> { let results = (outs LLVM_AnyStruct:$results); - let arguments = (ins + let arguments = (ins LLVM_AnyStruct:$inouts, - I64:$descriptorA, - I64:$descriptorB, + I64:$descriptorA, + I64:$descriptorB, NVVM_MMAShapeAttr:$shape, WGMMATypesAttr:$typeA, WGMMATypesAttr:$typeB, WGMMATypesAttr:$typeD, WGMMAScaleOutAttr:$scaleD, WGMMAScaleInAttr:$scaleA, - WGMMAScaleInAttr:$scaleB, + WGMMAScaleInAttr:$scaleB, MMALayoutAttr:$layoutA, MMALayoutAttr:$layoutB, OptionalAttr<MMAIntOverflowAttr>:$satfinite - ); - - let assemblyFormat = [{ + ); + + let assemblyFormat = [{ $descriptorA `,` $descriptorB `,` $inouts `,` $shape `,` `D` `[` $typeD `,` $scaleD (`,` $satfinite^)? `]` `,` - `A` `[` $typeA `,` $scaleA `,` $layoutA `]` `,` + `A` `[` $typeA `,` $scaleA `,` $layoutA `]` `,` `B` `[` $typeB `,` $scaleB `,` $layoutB `]` - attr-dict `:` + attr-dict `:` type($inouts) `->` type($results) }]; - + let description = [{ - The warpgroup (128 threads) level matrix multiply and accumulate operation + The warpgroup (128 threads) level matrix multiply and accumulate operation has either of the following forms, where matrix D is called accumulator: D = A * B + D D = A * B, where the input from accumulator D is disabled. 
- Supported shapes: + Supported shapes: ``` |--------------|--------------|------------|--------------|---------------| | | | | |f16+=e4m3*e4m3 | @@ -2089,14 +2104,14 @@ def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async", |--------------|--------------|------------|--------------|---------------| ``` - + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) }]; - + let hasVerifier = 1; let extraClassDeclaration = [{ - void getAsmValues(RewriterBase &rewriter, + void getAsmValues(RewriterBase &rewriter, llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues); }]; } diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 9b1be198f77a82..c93688fb04c3ff 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -209,7 +209,12 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> { ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); MLIRContext *context = rewriter.getContext(); - Value newOp = rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type()); + LLVM::ConstantRangeAttr bounds = nullptr; + if (std::optional<APInt> upperBound = op.getUpperBound()) + bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>( + 32, 0, upperBound->getZExtValue()); + Value newOp = + rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type(), bounds); // Truncate or extend the result depending on the index bitwidth specified // by the LLVMTypeConverter options. const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); @@ -340,27 +345,40 @@ void mlir::populateGpuSubgroupReduceOpLoweringPattern( void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { + using gpu::index_lowering::IndexKind; + using gpu::index_lowering::IntrType; populateWithGenerated(patterns); patterns.add<GPUPrintfOpToVPrintfLowering>(converter); patterns.add< gpu::index_lowering::OpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp, - NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>, + NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>( + converter, IndexKind::Block, IntrType::Id); + patterns.add< gpu::index_lowering::OpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp, - NVVM::BlockDimYOp, NVVM::BlockDimZOp>, + NVVM::BlockDimYOp, NVVM::BlockDimZOp>>( + converter, IndexKind::Block, IntrType::Dim); + patterns.add< gpu::index_lowering::OpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp, - NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>, - gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp, - NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>, - gpu::index_lowering::OpLowering< - gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp, - NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>, - gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp, - NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>, - gpu::index_lowering::OpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp, - NVVM::BlockIdYOp, NVVM::BlockIdZOp>, - gpu::index_lowering::OpLowering<gpu::GridDimOp, NVVM::GridDimXOp, - NVVM::GridDimYOp, NVVM::GridDimZOp>, - GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter); + NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>>( + converter, IndexKind::Other, IntrType::Id); + patterns.add<gpu::index_lowering::OpLowering< + gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp, + 
NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim); + patterns.add<gpu::index_lowering::OpLowering< + gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp, + NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>( + converter, IndexKind::Other, IntrType::Id); + patterns.add<gpu::index_lowering::OpLowering< + gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp, + NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim); + patterns.add<gpu::index_lowering::OpLowering< + gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>( + converter, IndexKind::Block, IntrType::Id); + patterns.add<gpu::index_lowering::OpLowering< + gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>( + converter, IndexKind::Grid, IntrType::Dim); + patterns.add<GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>( + converter); patterns.add<GPUDynamicSharedMemoryOpLowering>( converter, NVVM::kSharedMemoryAlignmentBit); diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp index 855abc12a909ef..bc830a77f3c580 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Target/LLVMIR/ModuleImport.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/IntrinsicsNVPTX.h" using namespace mlir; diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index 8f2ec289c9252c..ec93f131d5ec13 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -699,9 +699,21 @@ gpu.module @test_module_32 { } gpu.module @test_module_33 { -// CHECK-LABEL: func @kernel_with_block_size() -// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 128, 1, 1>} - gpu.func @kernel_with_block_size() kernel attributes {known_block_size = array<i32: 128, 1, 1>} { +// CHECK-LABEL: func @kernel_with_block_size( +// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 32, 4, 2>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 4, 2>} + gpu.func @kernel_with_block_size(%arg0: !llvm.ptr) kernel attributes {known_block_size = array<i32: 32, 4, 2>} { + // CHECK: = nvvm.read.ptx.sreg.tid.x range <0 : i32, 32 : i32> : i32 + %0 = gpu.thread_id x + // CHECK: = nvvm.read.ptx.sreg.tid.y range <0 : i32, 4 : i32> : i32 + %1 = gpu.thread_id y + // CHECK: = nvvm.read.ptx.sreg.tid.z range <0 : i32, 2 : i32> : i32 + %2 = gpu.thread_id z + + // Fake usage to prevent dead code elimination + %3 = arith.addi %0, %1 : index + %4 = arith.addi %3, %2 : index + %5 = arith.index_cast %4 : index to i64 + llvm.store %5, %arg0 : i64, !llvm.ptr gpu.return } } diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll index e4a8773e2dd806..131e9065b2d883 100644 --- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll +++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll @@ -58,6 +58,9 @@ define i32 @nvvm_special_regs() { %27 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank() ; CHECK: = nvvm.read.ptx.sreg.cluster.nctarank : i32 %28 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank() + + ; CHECK = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32 + %29 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x() ret i32 %1 } diff --git 
a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 6e2787d121ae64..eaf3750958dfd5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -62,7 +62,10 @@ llvm.func @nvvm_special_regs() -> i32 {
   %29 = nvvm.read.ptx.sreg.clock : i32
   // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64
   %30 = nvvm.read.ptx.sreg.clock64 : i64
-
+
+  // CHECK: %31 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %31 = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
+
   llvm.return %1 : i32
 }
 
@@ -84,7 +87,7 @@ llvm.func @llvm_nvvm_barrier0() {
 // CHECK-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]])
 llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32) {
   // CHECK: call void @llvm.nvvm.barrier0()
-  nvvm.barrier
+  nvvm.barrier
   // CHECK: call void @llvm.nvvm.barrier.n(i32 %[[barId]])
   nvvm.barrier id = %barID
   // CHECK: call void @llvm.nvvm.barrier(i32 %[[barId]], i32 %[[numThreads]])
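A usage note on the new attribute (not part of the quoted patch; the function name below is made up): on the rangeable special-register ops the optional `range` attribute prints in the op's assembly and is emitted as an LLVM `range` return attribute when translating to LLVM IR, and the importer reconstructs it from `range(...)` on the intrinsic call, as the Import/nvvmir.ll test above exercises.

```mlir
llvm.func @tid_with_range() -> i32 {
  // Translates to: %0 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %0 = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
  llvm.return %0 : i32
}
```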
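The lane-ID path works the same way: when `gpu.lane_id` carries an `upperBound` of 32, the GPULaneIdOpToNVVM pattern in this patch builds a 32-bit constant range [0, 32) and attaches it to the lowered op. The result would look roughly like the following sketch; only the NVVM side is shown, since the GPU-dialect spelling of the bound is not quoted in this excerpt, and the function name is illustrative.

```mlir
llvm.func @lane_id_in_warp() -> i32 {
  // Lane IDs are known to lie in [0, 32) when the upper bound is the warp size.
  %lane = nvvm.read.ptx.sreg.laneid range <0 : i32, 32 : i32> : i32
  llvm.return %lane : i32
}
```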