================
@@ -0,0 +1,357 @@
+//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for CUDA code generation targeting the NVIDIA CUDA
+// runtime library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCUDARuntime.h"
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+#include "mlir/IR/Operation.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/GlobalDecl.h"
+#include "clang/Basic/AddressSpaces.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "llvm/Support/Casting.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+
+class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
+protected:
+  StringRef Prefix;
+
+  // Map a device stub function to the symbol used to identify the kernel in
+  // host code. For CUDA, the symbol identifying the kernel is the device
+  // stub function itself. For HIP, the two are different.
+  llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles;
+
+  // Map a kernel handle to the kernel stub.
+  llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
+  // Mangle context for the device.
+  std::unique_ptr<MangleContext> deviceMC;
+
+private:
+  void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
+                             FunctionArgList &args);
+  mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
+                                FunctionArgList &args);
+  mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
+  std::string addPrefixToName(StringRef funcName) const;
+  std::string addUnderscoredPrefixToName(StringRef funcName) const;
+
+public:
+  CIRGenNVCUDARuntime(CIRGenModule &cgm);
+  ~CIRGenNVCUDARuntime();
+
+  void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+                      FunctionArgList &args) override;
+};
+
+} // namespace
+
+std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
+  return (Prefix + funcName).str();
+}
+
+std::string
+CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
+  return ("__" + Prefix + funcName).str();
+}
+
+static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) {
+  // If the host and device have different C++ ABIs, create a device mangle
+  // context so that mangling retrieves the additional device lambda mangling
+  // number instead of the regular host one.
+  if (cgm.getASTContext().getAuxTargetInfo() &&
+      cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+      cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
+    return std::unique_ptr<MangleContext>(
+        cgm.getASTContext().createDeviceMangleContext(
+            *cgm.getASTContext().getAuxTargetInfo()));
+  }
+
+  return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext(
+      cgm.getASTContext().getAuxTargetInfo()));
+}
+
+CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
+    : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) {
+  if (cgm.getLangOpts().OffloadViaLLVM)
+    llvm_unreachable("NYI");
+  else if (cgm.getLangOpts().HIP)
+    Prefix = "hip";
+  else
+    Prefix = "cuda";
+}
+
+mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
+                                                   mlir::Location loc,
+                                                   FunctionArgList &args) {
+  auto &builder = cgm.getBuilder();
+
+  // Build void *args[] and populate it with the addresses of the kernel
+  // arguments.
+  auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
+  mlir::Value kernelArgs = builder.createAlloca(
+      loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy,
+      "kernel_args", CharUnits::fromQuantity(16));
+
+  mlir::Value kernelArgsDecayed =
+      builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
+                         cir::PointerType::get(cgm.voidPtrTy));
+
+  for (auto [i, arg] : llvm::enumerate(args)) {
+    mlir::Value index =
+        builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
+    mlir::Value storePos =
+        builder.createPtrStride(loc, kernelArgsDecayed, index);
+
+    // Get the address of the argument and cast the store destination to match
+    // its pointer-to-pointer type. This is needed because upstream's
+    // createStore doesn't auto-bitcast like the incubator version does.
+    mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
+    mlir::Value storePosTyped = builder.createBitcast(
+        storePos, cir::PointerType::get(argAddr.getType()));
+
+    builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped);
+  }
+
+  return kernelArgsDecayed;
+}
+
+// CUDA 9.0+ uses a new way to launch kernels: parameters are packed into a
+// local array and the kernel is launched through cudaLaunchKernel().
+void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
+                                                cir::FuncOp fn,
+                                                FunctionArgList &args) {
+
+  // This requires arguments to be sent to kernels in a different way.
+  if (cgm.getLangOpts().OffloadViaLLVM)
+    cgm.errorNYI("Offload via LLVM");
+
+  auto &builder = cgm.getBuilder();
+  auto loc = fn.getLoc();
+
+  // For [cuda|hip]LaunchKernel, we must add another layer of indirection
+  // to the arguments. For example, for a function `add(int a, float b)`,
+  // we need to pass its arguments as `void *args[2] = { &a, &b }`.
+  mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
+
+  // Look up the cudaLaunchKernel/hipLaunchKernel function.
+  // The HIP kernel-launching API name depends on the -fgpu-default-stream
+  // option. For the default value 'legacy', it is hipLaunchKernel; for
+  // 'per-thread', it is hipLaunchKernel_spt.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+  //                              dim3 blockDim, void **args,
+  //                              size_t sharedMem, cudaStream_t stream);
+  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
+  //                                  dim3 blockDim, void **args,
+  //                                  size_t sharedMem, hipStream_t stream);
+  TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
+  DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
+
+  // The default stream is usually stream 0 (the legacy default stream).
+  // For a per-thread default stream, we need a different LaunchKernel
+  // function.
+  std::string kernelLaunchAPI = "LaunchKernel";
+  if (cgm.getLangOpts().GPUDefaultStream ==
+      LangOptions::GPUDefaultStreamKind::PerThread)
+    cgm.errorNYI("CUDA/HIP Stream per thread");
+
+  std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
+  const IdentifierInfo &launchII =
+      cgm.getASTContext().Idents.get(launchKernelName);
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (auto *result : dc->lookup(&launchII)) {
+    if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
+      cudaLaunchKernelFD = fd;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+    cgm.error(cgf.curFuncDecl->getLocation(),
+              "Can't find declaration for " + launchKernelName);
+    return;
+  }
+
+  // Use this function to retrieve the arguments for cudaLaunchKernel:
+  // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
+  //                                      size_t *sharedMem,
+  //                                      cudaStream_t *stream)
+  //
+  // Here [cuda|hip]Stream_t, while also being the 6th argument of
+  // [cuda|hip]LaunchKernel, is a pointer to an opaque struct.
+
+  mlir::Type dim3Ty = cgf.getTypes().convertType(
+      cudaLaunchKernelFD->getParamDecl(1)->getType());
+  mlir::Type streamTy = cgf.getTypes().convertType(
+      cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  mlir::Value gridDim =
+      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+                           "grid_dim", CharUnits::fromQuantity(8));
+  mlir::Value blockDim =
+      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+                           "block_dim", CharUnits::fromQuantity(8));
+  mlir::Value sharedMem =
+      builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
+                           "shared_mem", cgm.getSizeAlign());
+  mlir::Value stream =
+      builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
+                           "stream", cgm.getPointerAlign());
+
+  cir::FuncOp popConfig = cgm.createRuntimeFunction(
+      cir::FuncType::get({gridDim.getType(), blockDim.getType(),
+                          sharedMem.getType(), stream.getType()},
+                         cgm.sInt32Ty),
+      addUnderscoredPrefixToName("PopCallConfiguration"));
+  cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
+
+  // Now emit the call to cudaLaunchKernel:
+  // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
+  //                                          dim3 blockDim, void **args,
+  //                                          size_t sharedMem,
+  //                                          [cuda|hip]Stream_t stream);
+
+  // We now pick either the stub function (CUDA) or the stub global (HIP),
+  // respectively, as the kernel handle.
+  auto kernel = [&]() {
+    if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
+            kernelHandles[fn.getSymName()])) {
+      auto kernelTy = cir::PointerType::get(globalOp.getSymType());
+      mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy,
+                                                    globalOp.getSymName());
+      return kernel;
+    }
+    if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
+            kernelHandles[fn.getSymName()])) {
+      auto kernelTy = cir::PointerType::get(funcOp.getFunctionType());
+      mlir::Value kernel =
+          cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
+      mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy);
+      return func;
+    }
+    llvm_unreachable("Expected stub handle to be cir::GlobalOp or cir::FuncOp");
+  }();
+
+  CallArgList launchArgs;
+  launchArgs.add(RValue::get(kernel),
+                 cudaLaunchKernelFD->getParamDecl(0)->getType());
+  launchArgs.add(
+      RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
+      cudaLaunchKernelFD->getParamDecl(1)->getType());
+  launchArgs.add(
+      RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
+      cudaLaunchKernelFD->getParamDecl(2)->getType());
+  launchArgs.add(RValue::get(kernelArgs),
+                 cudaLaunchKernelFD->getParamDecl(3)->getType());
+  launchArgs.add(
+      RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
+      cudaLaunchKernelFD->getParamDecl(4)->getType());
+  launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
+                 cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  mlir::Type launchTy =
+      cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
+  mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
+      cast<cir::FuncType>(launchTy), launchKernelName);
+  const auto &callInfo =
+      cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+  cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
+               ReturnValueSlot(), launchArgs);
+
+  if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+      !cgf.getLangOpts().HIP)
+    cgm.errorNYI("MSVC CUDA stub handling");
+}
+
+void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+                                         FunctionArgList &args) {
+
+  if (auto globalOp =
+          llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
+    auto &builder = cgm.getBuilder();
+    auto fnPtrTy = globalOp.getSymType();
+    auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
+    auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
+
+    globalOp->setAttr("initial_value", gv);
+    globalOp->removeAttr("sym_visibility");
+    globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+                                       cgm.getPointerAlign().getQuantity()));
+  }
+
+  // CUDA 9.0 changed the way kernels are launched.
+  if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+      (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
+      cgm.getLangOpts().OffloadViaLLVM)
+    emitDeviceStubBodyNew(cgf, fn, args);
+  else
+    cgm.errorNYI("Emit Stub Body Legacy");
+}
+
+CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
+  return new CIRGenNVCUDARuntime(cgm);
+}
+
+CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
+
+mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
+                                                      GlobalDecl GD) {
+
+  // Check if we already have a kernel handle for this function.
+  auto Loc = kernelHandles.find(fn.getSymName());
----------------
andykaylor wrote:

```suggestion
  auto it = kernelHandles.find(fn.getSymName());
```

The prevalence of `loc` in MLIR code makes it a bad identifier to use for anything other than an `mlir::Location`.

https://github.com/llvm/llvm-project/pull/177790
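
To make the control flow of `emitDeviceStubBodyNew` concrete, here is a minimal, self-contained C++ sketch of what the emitted host stub corresponds to for a hypothetical kernel `add(int a, float b)`. The stand-in types, the stub name `__device_stub__add`, and the exact entry-point signatures are assumptions taken from the comments quoted in the patch, not code from the PR:

```cpp
#include <stddef.h>

// Stand-in types so the sketch compiles without the CUDA headers; the real
// definitions live in cuda_runtime.h.
struct dim3 {
  unsigned x = 1, y = 1, z = 1;
};
using cudaStream_t = struct CUstream_st *;
using cudaError_t = int;

// Runtime entry points, with the signatures quoted in the patch's comments.
extern "C" int __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
                                          size_t *sharedMem,
                                          cudaStream_t *stream);
extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                        dim3 blockDim, void **args,
                                        size_t sharedMem, cudaStream_t stream);

// Hypothetical stub for a kernel `add(int a, float b)` (names illustrative).
void __device_stub__add(int a, float b) {
  // The extra layer of indirection: pass the *addresses* of the arguments.
  void *args[2] = {&a, &b};

  // Pop the <<<...>>> launch configuration that the call site pushed.
  dim3 gridDim, blockDim;
  size_t sharedMem;
  cudaStream_t stream;
  __cudaPopCallConfiguration(&gridDim, &blockDim, &sharedMem, &stream);

  // On CUDA the stub itself doubles as the kernel handle, so its own address
  // is passed as `func`; on HIP a separate stub global plays that role.
  cudaLaunchKernel((const void *)__device_stub__add, gridDim, blockDim, args,
                   sharedMem, stream);
}
```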

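The configuration that the stub pops has to be pushed by the kernel's call site; that lowering is outside this patch. The sketch below assumes the conventional `__cudaPushCallConfiguration` entry point (its signature here is an assumption) and reuses the stand-in types and stub from the previous sketch to show how a triple-chevron launch would pair with the stub:

```cpp
// Caller-side counterpart: assumption about how `add<<<grid, block>>>(1,
// 2.0f)` is lowered. Builds on the declarations in the sketch above.
extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
                                                size_t sharedMem,
                                                cudaStream_t stream);
void __device_stub__add(int a, float b); // from the sketch above

void launch_add(dim3 grid, dim3 block) {
  // Push grid/block/sharedMem/stream for the stub to pop...
  __cudaPushCallConfiguration(grid, block, /*sharedMem=*/0,
                              /*stream=*/nullptr);
  // ...then call the stub, which forwards everything to cudaLaunchKernel.
  __device_stub__add(1, 2.0f);
}
```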