Valentin Clement (バレンタイン クレメン) Message-ID: In-Reply-To: <llvm.org/llvm/llvm-project/pull/131...@github.com>
https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/131395 >From bd481839b595df26ec54ecf75ecdedef5425c0dd Mon Sep 17 00:00:00 2001 From: Valentin Clement <clement...@gmail.com> Date: Fri, 14 Mar 2025 14:19:08 -0700 Subject: [PATCH 1/2] [flang][cuda] Compute offset on cuf.shared_memory ops --- .../flang/Optimizer/Builder/CUFCommon.h | 1 + .../flang/Optimizer/Transforms/Passes.h | 1 + .../flang/Optimizer/Transforms/Passes.td | 13 ++ flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 + .../CUFComputeSharedMemoryOffsetsAndSize.cpp | 126 ++++++++++++++++++ flang/test/Fir/CUDA/cuda-shared-offset.mlir | 56 ++++++++ 6 files changed, 198 insertions(+) create mode 100644 flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp create mode 100644 flang/test/Fir/CUDA/cuda-shared-offset.mlir diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index e3c7b5098b83f..65b9cce1d2021 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -14,6 +14,7 @@ #include "mlir/IR/BuiltinOps.h" static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod"; +static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem"; namespace fir { class FirOpBuilder; diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 406fedf220d26..6dbabd523f88a 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -43,6 +43,7 @@ namespace fir { #define GEN_PASS_DECL_CUFDEVICEGLOBAL #define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION #define GEN_PASS_DECL_CUFOPCONVERSION +#define GEN_PASS_DECL_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE #define GEN_PASS_DECL_EXTERNALNAMECONVERSION #define GEN_PASS_DECL_MEMREFDATAFLOWOPT #define GEN_PASS_DECL_SIMPLIFYINTRINSICS diff --git 
a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index e5c17cf7d8881..fbab435887b8a 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -453,6 +453,19 @@ def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> { ]; } +def CUFComputeSharedMemoryOffsetsAndSize + : Pass<"cuf-compute-shared-memory", "mlir::ModuleOp"> { + let summary = "Create the shared memory global variable and set offsets"; + + let description = [{ + Compute the size and alignment of the shared memory global and materialize + it. Compute the offset of each cuf.shared_memory operation according to + the global and set it. + }]; + + let dependentDialects = ["fir::FIROpsDialect"]; +} + def SetRuntimeCallAttributes : Pass<"set-runtime-call-attrs", "mlir::func::FuncOp"> { let summary = "Set Fortran runtime fir.call attributes targeting LLVM IR"; diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 6e8666759ab83..ca08e4607e019 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -13,6 +13,7 @@ add_flang_library(FIRTransforms CUFDeviceGlobal.cpp CUFOpConversion.cpp CUFGPUToLLVMConversion.cpp + CUFComputeSharedMemoryOffsetsAndSize.cpp ArrayValueCopy.cpp ExternalNameConversion.cpp MemoryUtils.cpp diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp new file mode 100644 index 0000000000000..1881ae72ee721 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp @@ -0,0 +1,126 @@ +//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/RTBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/CodeGen/Target.h" +#include "flang/Optimizer/CodeGen/TypeConverter.h" +#include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/Dialect/FIRAttr.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Support/DataLayout.h" +#include "flang/Runtime/CUDA/registration.h" +#include "flang/Runtime/entry-names.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/SmallVector.h" + +namespace fir { +#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +using namespace Fortran::runtime::cuda; + +namespace { + +struct CUFComputeSharedMemoryOffsetsAndSize + : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase< + CUFComputeSharedMemoryOffsetsAndSize> { + + void runOnOperation() override { + mlir::ModuleOp mod = getOperation(); + mlir::SymbolTable symTab(mod); + mlir::OpBuilder opBuilder{mod.getBodyRegion()}; + fir::FirOpBuilder builder(opBuilder, mod); + fir::KindMapping kindMap{fir::getKindMapping(mod)}; + std::optional<mlir::DataLayout> dl = + fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false); + if (!dl) { + mlir::emitError(mod.getLoc(), + "data layout attribute is required to perform " + + getName() + "pass"); + } + + auto gpuMod = cuf::getOrCreateGPUModule(mod, 
symTab); + mlir::Type i8Ty = builder.getI8Type(); + for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) { + unsigned nbDynamicSharedVariables = 0; + unsigned nbStaticSharedVariables = 0; + uint64_t sharedMemSize = 0; + unsigned short alignment = 0; + + // Go over each shared memory operation and compute their start offset and + // the size and alignment of the global to be generated if all variables + // are static. If this is dynamic shared memory, then only the alignment + // is computed. + for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) { + if (fir::hasDynamicSize(sharedOp.getInType())) { + mlir::Type ty = sharedOp.getInType(); + // getTypeSizeAndAlignmentOrCrash will crash trying to compute the + // size of an array with dynamic size. Just get the alignment to + // create the global. + if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) + ty = seqTy.getEleTy(); + unsigned short align = dl->getTypeABIAlignment(ty); + ++nbDynamicSharedVariables; + sharedOp.setOffset(0); + alignment = std::max(alignment, align); + continue; + } + auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash( + sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap); + ++nbStaticSharedVariables; + sharedOp.setOffset(llvm::alignTo(sharedMemSize, align)); + sharedMemSize = + llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align); + alignment = std::max(alignment, align); + } + if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0) + mlir::emitError( + funcOp.getLoc(), + "static and dynamic shared variables in a single kernel"); + + mlir::DenseElementsAttr init = {}; + if (sharedMemSize > 0) { + auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty); + mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0); + init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); + } + + // Create the shared memory global where each shared variables will point + // to. 
+ auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty); + std::string sharedMemGlobalName = + (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str(); + mlir::StringAttr linkage = builder.createInternalLinkage(); + builder.setInsertionPointToEnd(gpuMod.getBody()); + llvm::SmallVector<mlir::NamedAttribute> attrs; + auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), + gpuMod.getContext()); + attrs.push_back(mlir::NamedAttribute( + fir::GlobalOp::getDataAttrAttrName(globalOpName), + cuf::DataAttributeAttr::get(gpuMod.getContext(), + cuf::DataAttribute::Shared))); + auto sharedMem = builder.create<fir::GlobalOp>( + funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType, + init, linkage, attrs); + sharedMem.setAlignment(alignment); + } + } +}; + +} // end anonymous namespace diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir new file mode 100644 index 0000000000000..b3ea7dfc89cc7 --- /dev/null +++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir @@ -0,0 +1,56 @@ +// RUN: fir-opt --split-input-file --cuf-compute-shared-memory %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, 
#dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + gpu.module @cuda_device_mod { + gpu.func @_QPdynshared() kernel { + %c-1 = arith.constant -1 : index + %6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> + %7 = fir.shape %c-1 : (index) -> !fir.shape<1> + %8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>> + gpu.return + } + } +} + +// CHECK-LABEL: gpu.module @cuda_device_mod +// CHECK: gpu.func @_QPdynshared() +// CHECK: %{{.*}} = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", offset = 0 : i32, uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> +// CHECK: gpu.return +// CHECK: } +// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8> + +// ----- + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, 
dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + gpu.module @cuda_device_mod { + gpu.func @_QPshared_static() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} { + %0 = cuf.shared_memory i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32> + %1 = fir.declare %0 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEa"} : (!fir.ref<i32>) -> !fir.ref<i32> + %2 = cuf.shared_memory i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32> + %3 = fir.declare %2 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEb"} : (!fir.ref<i32>) -> !fir.ref<i32> + %8 = cuf.shared_memory i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32> + %9 = fir.declare %8 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEc"} : (!fir.ref<i32>) -> !fir.ref<i32> + %10 = cuf.shared_memory i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32> + %11 = fir.declare %10 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEd"} : (!fir.ref<i32>) -> !fir.ref<i32> + %12 = cuf.shared_memory i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64> + %13 = fir.declare %12 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEe"} : (!fir.ref<i64>) -> !fir.ref<i64> + %16 = cuf.shared_memory f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32> + %17 = fir.declare %16 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEr"} : (!fir.ref<f32>) -> !fir.ref<f32> + gpu.return + } + } 
+} + +// CHECK-LABEL: gpu.module @cuda_device_mod +// CHECK: gpu.func @_QPshared_static() +// CHECK: cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32> +// CHECK: cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32> +// CHECK: cuf.shared_memory i32 {bindc_name = "c", offset = 8 : i32, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32> +// CHECK: cuf.shared_memory i32 {bindc_name = "d", offset = 12 : i32, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32> +// CHECK: cuf.shared_memory i64 {bindc_name = "e", offset = 16 : i32, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64> +// CHECK: cuf.shared_memory f32 {bindc_name = "r", offset = 24 : i32, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32> +// CHECK: gpu.return +// CHECK: } +// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8> +// CHECK: } +// CHECK: } >From 8ac97ea1b6f88e3ae0a3de4dcabfcacecd1a55ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clement...@gmail.com> Date: Fri, 14 Mar 2025 15:06:21 -0700 Subject: [PATCH 2/2] Update flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp --- .../Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp index 1881ae72ee721..5c6d1233c3ed3 100644 --- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp +++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp @@ -101,7 +101,7 @@ struct CUFComputeSharedMemoryOffsetsAndSize init = 
mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); } - // Create the shared memory global where each shared variables will point + // Create the shared memory global where each shared variable will point // to. auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty); std::string sharedMemGlobalName = _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits