================
@@ -0,0 +1,357 @@
+//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for CUDA code generation targeting the NVIDIA CUDA
+// runtime library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCUDARuntime.h"
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+#include "mlir/IR/Operation.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/GlobalDecl.h"
+#include "clang/Basic/AddressSpaces.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "llvm/Support/Casting.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+
+class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
+protected:
+ StringRef Prefix;
+
+  // Map a device stub function to a symbol for identifying the kernel in
+  // host code. For CUDA, the symbol for identifying the kernel is the same
+  // as the device stub function. For HIP, they are different.
+ llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles;
+
+ // Map a kernel handle to the kernel stub.
+ llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
+ // Mangle context for device.
+ std::unique_ptr<MangleContext> deviceMC;
+
+private:
+ void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
+ FunctionArgList &args);
+ mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
+ FunctionArgList &args);
+ mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
+ std::string addPrefixToName(StringRef funcName) const;
+ std::string addUnderscoredPrefixToName(StringRef funcName) const;
+
+public:
+ CIRGenNVCUDARuntime(CIRGenModule &cgm);
+ ~CIRGenNVCUDARuntime();
+
+ void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+ FunctionArgList &args) override;
+};
+
+} // namespace
+
+std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
+ return (Prefix + funcName).str();
+}
+
+std::string
+CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
+ return ("__" + Prefix + funcName).str();
+}
+
+static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) {
+ // If the host and device have different C++ ABIs, mark it as the device
+ // mangle context so that the mangling needs to retrieve the additional
+ // device lambda mangling number instead of the regular host one.
+ if (cgm.getASTContext().getAuxTargetInfo() &&
+ cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+ cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
+ return std::unique_ptr<MangleContext>(
+ cgm.getASTContext().createDeviceMangleContext(
+ *cgm.getASTContext().getAuxTargetInfo()));
+ }
+
+  return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext(
+ cgm.getASTContext().getAuxTargetInfo()));
+}
+
+CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
+ : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) {
+ if (cgm.getLangOpts().OffloadViaLLVM)
+ llvm_unreachable("NYI");
+ else if (cgm.getLangOpts().HIP)
+ Prefix = "hip";
+ else
+ Prefix = "cuda";
+}
+
+mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
+ mlir::Location loc,
+ FunctionArgList &args) {
+ auto &builder = cgm.getBuilder();
+
+ // Build void *args[] and populate with the addresses of kernel arguments.
+ auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
+ mlir::Value kernelArgs = builder.createAlloca(
+      loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
+ CharUnits::fromQuantity(16));
+
+ mlir::Value kernelArgsDecayed =
+ builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
+ cir::PointerType::get(cgm.voidPtrTy));
+
+ for (auto [i, arg] : llvm::enumerate(args)) {
+ mlir::Value index =
+ builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
+ mlir::Value storePos =
+ builder.createPtrStride(loc, kernelArgsDecayed, index);
+
+ // Get the address of the argument and cast the store destination to match
+ // its pointer-to-pointer type. This is needed because upstream's
+ // createStore doesn't auto-bitcast like the incubator version.
+ mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
+ mlir::Value storePosTyped = builder.createBitcast(
+ storePos, cir::PointerType::get(argAddr.getType()));
+
+ builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped);
+ }
+
+ return kernelArgsDecayed;
+}
+
+// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a
+// local array and kernels are launched using cudaLaunchKernel().
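+//
+// Conceptually, for a kernel `__global__ void add(int a, float b)`, the stub
+// emitted here behaves like the following C-style sketch (illustrative only:
+// the real output is CIR, and the `__device_stub__add` name is hypothetical):
+//
+//   void __device_stub__add(int a, float b) {
+//     void *kernel_args[2] = {&a, &b};
+//     dim3 grid_dim, block_dim;
+//     size_t shared_mem;
+//     cudaStream_t stream;
+//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shared_mem, &stream);
+//     cudaLaunchKernel((const void *)__device_stub__add, grid_dim, block_dim,
+//                      kernel_args, shared_mem, stream);
+//   }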
+void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
+ cir::FuncOp fn,
+ FunctionArgList &args) {
+
+ // This requires arguments to be sent to kernels in a different way.
+ if (cgm.getLangOpts().OffloadViaLLVM)
+ cgm.errorNYI("Offload via LLVM");
+
+ auto &builder = cgm.getBuilder();
+ auto loc = fn.getLoc();
+
+ // For [cuda|hip]LaunchKernel, we must add another layer of indirection
+ // to arguments. For example, for function `add(int a, float b)`,
+ // we need to pass it as `void *args[2] = { &a, &b }`.
+ mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
+
+ // Lookup cudaLaunchKernel/hipLaunchKernel function.
+ // HIP kernel launching API name depends on -fgpu-default-stream option. For
+ // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
+ // it is hipLaunchKernel_spt.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ // void **args, size_t sharedMem,
+ // cudaStream_t stream);
+ // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
+ // dim3 blockDim, void **args,
+ // size_t sharedMem, hipStream_t stream);
+ TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
+ DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
+
+ // The default stream is usually stream 0 (the legacy default stream).
+ // For per-thread default stream, we need a different LaunchKernel function.
+ std::string kernelLaunchAPI = "LaunchKernel";
+ if (cgm.getLangOpts().GPUDefaultStream ==
+ LangOptions::GPUDefaultStreamKind::PerThread)
+ cgm.errorNYI("CUDA/HIP Stream per thread");
+
+ std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
+ const IdentifierInfo &launchII =
+ cgm.getASTContext().Idents.get(launchKernelName);
+ FunctionDecl *cudaLaunchKernelFD = nullptr;
+ for (auto *result : dc->lookup(&launchII)) {
+ if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
+ cudaLaunchKernelFD = fd;
+ }
+
+ if (cudaLaunchKernelFD == nullptr) {
+ cgm.error(cgf.curFuncDecl->getLocation(),
+ "Can't find declaration for " + launchKernelName);
+ return;
+ }
+
+ // Use this function to retrieve arguments for cudaLaunchKernel:
+ // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
+ // *sharedMem, cudaStream_t *stream)
+ //
+ // Here [cuda|hip]Stream_t, while also being the 6th argument of
+ // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.
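+  //
+  // The configuration popped here is the one pushed by
+  // __[cuda|hip]PushCallConfiguration when the host code evaluated the
+  // kernel<<<gridDim, blockDim, sharedMem, stream>>> launch syntax.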
+
+ mlir::Type dim3Ty = cgf.getTypes().convertType(
+ cudaLaunchKernelFD->getParamDecl(1)->getType());
+ mlir::Type streamTy = cgf.getTypes().convertType(
+ cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+ mlir::Value gridDim =
+ builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+ "grid_dim", CharUnits::fromQuantity(8));
+ mlir::Value blockDim =
+ builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+ "block_dim", CharUnits::fromQuantity(8));
+ mlir::Value sharedMem =
+ builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
+ "shared_mem", cgm.getSizeAlign());
+ mlir::Value stream =
+ builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
+ "stream", cgm.getPointerAlign());
+
+ cir::FuncOp popConfig = cgm.createRuntimeFunction(
+ cir::FuncType::get({gridDim.getType(), blockDim.getType(),
+ sharedMem.getType(), stream.getType()},
+ cgm.sInt32Ty),
+ addUnderscoredPrefixToName("PopCallConfiguration"));
+ cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
+
+ // Now emit the call to cudaLaunchKernel
+ // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
+ // dim3 blockDim,
+ // void **args, size_t sharedMem,
+ // [cuda|hip]Stream_t stream);
+
+  // We now pick either the function or the stub global, for CUDA and HIP
+  // respectively.
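+  // (For HIP, `kernelHandles` maps the stub to a cir::GlobalOp whose address
+  // serves as the kernel handle; for CUDA, it maps the stub to its own
+  // cir::FuncOp, whose address is bitcast to `void *` and passed directly.)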
+ auto kernel = [&]() {
+ if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
+ kernelHandles[fn.getSymName()])) {
+ auto kernelTy = cir::PointerType::get(globalOp.getSymType());
+ mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy,
+ globalOp.getSymName());
+ return kernel;
+ }
+ if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
+ kernelHandles[fn.getSymName()])) {
+ auto kernelTy = cir::PointerType::get(funcOp.getFunctionType());
+ mlir::Value kernel =
+          cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
+ mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy);
+ return func;
+ }
+    assert(false && "Expected stub handle to be cir::GlobalOp or cir::FuncOp");
+ }();
+
+ CallArgList launchArgs;
+ launchArgs.add(RValue::get(kernel),
+ cudaLaunchKernelFD->getParamDecl(0)->getType());
+ launchArgs.add(
+ RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
+ cudaLaunchKernelFD->getParamDecl(1)->getType());
+ launchArgs.add(
+ RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
+ cudaLaunchKernelFD->getParamDecl(2)->getType());
+ launchArgs.add(RValue::get(kernelArgs),
+ cudaLaunchKernelFD->getParamDecl(3)->getType());
+ launchArgs.add(
+ RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
+ cudaLaunchKernelFD->getParamDecl(4)->getType());
+  launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
+ cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+ mlir::Type launchTy =
+ cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
+ mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
+ cast<cir::FuncType>(launchTy), launchKernelName);
+ const auto &callInfo =
+ cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+ cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
+ ReturnValueSlot(), launchArgs);
+
+ if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+ !cgf.getLangOpts().HIP)
+ cgm.errorNYI("MSVC CUDA stub handling");
+}
+
+void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+ FunctionArgList &args) {
+
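+  // If the kernel handle is a cir::GlobalOp (the HIP case), initialize it to
+  // point at the stub function, now that the stub has been emitted.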
+ if (auto globalOp =
+ llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
+ auto &builder = cgm.getBuilder();
+ auto fnPtrTy = globalOp.getSymType();
+ auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
+ auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
+
+ globalOp->setAttr("initial_value", gv);
+ globalOp->removeAttr("sym_visibility");
+ globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+ cgm.getPointerAlign().getQuantity()));
+ }
+
+ // CUDA 9.0 changed the way to launch kernels.
+ if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
+ CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+ (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
+ cgm.getLangOpts().OffloadViaLLVM)
+ emitDeviceStubBodyNew(cgf, fn, args);
+ else
+ cgm.errorNYI("Emit Stub Body Legacy");
+}
+
+CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
+ return new CIRGenNVCUDARuntime(cgm);
+}
+
+CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
+
+mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
+ GlobalDecl GD) {
+
+  // Check if we already have a kernel handle for this function.
+ auto Loc = kernelHandles.find(fn.getSymName());
+ if (Loc != kernelHandles.end()) {
+ auto OldHandle = Loc->second;
----------------
andykaylor wrote:
```suggestion
mlir::Operation *oldHandle = Loc->second;
```
https://github.com/llvm/llvm-project/pull/177790