https://github.com/clementval created https://github.com/llvm/llvm-project/pull/113959
Kernel launch in CUF are converted to `gpu.launch_func`. When the kernel has `cluster_dims` specified these get carried over to the `gpu.launch_func` operation. This patch updates the special conversion of `gpu.launch_func` when cluster dims are present to the newly added entry point. >From 912b3e1d5e98a5d5bb4f1fc5eaa8e6ba3a42158c Mon Sep 17 00:00:00 2001 From: Valentin Clement <clement...@gmail.com> Date: Mon, 28 Oct 2024 09:52:27 -0700 Subject: [PATCH] [flang][cuda] Convert gpu.launch_func to CUFLaunchClusterKernel when cluster dims are present --- .../Transforms/CUFGPUToLLVMConversion.cpp | 83 ++++++++++++------- flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir | 24 +++++- 2 files changed, 76 insertions(+), 31 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index 5645ce6e6858c8..c64f35542a6e59 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -76,11 +76,6 @@ struct GPULaunchKernelConversion mlir::LogicalResult matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { - - if (op.hasClusterSize()) { - return mlir::failure(); - } - mlir::Location loc = op.getLoc(); auto *ctx = rewriter.getContext(); mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>(); @@ -107,37 +102,65 @@ struct GPULaunchKernelConversion rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName()); } - auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>( - RTNAME_STRING(CUFLaunchKernel)); - auto llvmIntPtrType = mlir::IntegerType::get( ctx, this->getTypeConverter()->getPointerBitwidth(0)); auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx); - auto funcTy = mlir::LLVM::LLVMFunctionType::get( - voidTy, - {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, - llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy}, - /*isVarArg=*/false); - - auto cufLaunchKernel = mlir::SymbolRefAttr::get( - mod.getContext(), RTNAME_STRING(CUFLaunchKernel)); - if (!funcOp) { - mlir::OpBuilder::InsertionGuard insertGuard(rewriter); - rewriter.setInsertionPointToStart(mod.getBody()); - auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>( - loc, RTNAME_STRING(CUFLaunchKernel), funcTy); - launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private); - } mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy); - rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>( - op, funcTy, cufLaunchKernel, - mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(), - adaptor.getGridSizeY(), adaptor.getGridSizeZ(), - adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), - adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs, - nullPtr}); + if (op.hasClusterSize()) { + auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>( + RTNAME_STRING(CUFLaunchClusterKernel)); + auto funcTy = mlir::LLVM::LLVMFunctionType::get( + voidTy, + {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, + llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, + llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy}, + /*isVarArg=*/false); + auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get( + mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel)); + if (!funcOp) { + mlir::OpBuilder::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(mod.getBody()); + auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>( + loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy); + launchKernelFuncOp.setVisibility( + mlir::SymbolTable::Visibility::Private); + } + rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>( + op, funcTy, cufLaunchClusterKernel, + mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(), + adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(), + adaptor.getGridSizeX(), adaptor.getGridSizeY(), + adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), + adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), + dynamicMemorySize, kernelArgs, nullPtr}); + } else { + auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>( + RTNAME_STRING(CUFLaunchKernel)); + auto funcTy = mlir::LLVM::LLVMFunctionType::get( + voidTy, + {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, + llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy}, + /*isVarArg=*/false); + auto cufLaunchKernel = mlir::SymbolRefAttr::get( + mod.getContext(), RTNAME_STRING(CUFLaunchKernel)); + if (!funcOp) { + mlir::OpBuilder::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(mod.getBody()); + auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>( + loc, RTNAME_STRING(CUFLaunchKernel), funcTy); + launchKernelFuncOp.setVisibility( + mlir::SymbolTable::Visibility::Private); + } + rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>( + op, funcTy, cufLaunchKernel, + mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(), + adaptor.getGridSizeY(), adaptor.getGridSizeZ(), + adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), + adaptor.getBlockSizeZ(), dynamicMemorySize, + kernelArgs, nullPtr}); + } return mlir::success(); } diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir index f10bd82f978dc4..7fede7c6c17b78 100644 --- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir +++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir @@ -1,4 +1,4 @@ -// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s +// RUN: fir-opt --split-input-file --cuf-gpu-convert-to-llvm %s | FileCheck %s module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (g...@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { llvm.func @_QMmod1Phost_sub() { @@ -102,3 +102,25 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr // CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], {{.*}}) + +// ----- + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (g...@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} + llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} { + llvm.return + } + llvm.func @_QQmain() attributes {fir.bindc_name = "test"} { + %0 = llvm.mlir.constant(1 : index) : i64 + %1 = llvm.mlir.constant(2 : index) : i64 + %2 = llvm.mlir.constant(0 : i32) : i32 + %3 = llvm.mlir.constant(10 : index) : i64 + gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 + llvm.return + } + gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">] +} + +// CHECK-LABEL: llvm.func @_QQmain() +// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 +// CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}}) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits