================
@@ -0,0 +1,357 @@
+//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for CUDA code generation targeting the NVIDIA CUDA
+// runtime library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCUDARuntime.h"
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+#include "mlir/IR/Operation.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/GlobalDecl.h"
+#include "clang/Basic/AddressSpaces.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "llvm/Support/Casting.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+
+class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
+protected:
+  StringRef Prefix;
+
+  // Map a device stub function to the symbol used to identify the kernel in
+  // host code. For CUDA, the symbol identifying the kernel is the device
+  // stub function itself. For HIP, the two are different.
+  llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles;
+
+  // Map a kernel handle to the kernel stub.
+  llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
+  // Mangle context for the device.
+  std::unique_ptr<MangleContext> deviceMC;
+
+private:
+  void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
+                             FunctionArgList &args);
+  mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
+                                FunctionArgList &args);
+  mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
+  std::string addPrefixToName(StringRef funcName) const;
+  std::string addUnderscoredPrefixToName(StringRef funcName) const;
+
+public:
+  CIRGenNVCUDARuntime(CIRGenModule &cgm);
+  ~CIRGenNVCUDARuntime();
+
+  void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+                      FunctionArgList &args) override;
+};
+
+} // namespace
+
+std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
+  return (Prefix + funcName).str();
+}
+
+std::string
+CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
+  return ("__" + Prefix + funcName).str();
+}
+
+static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) {
+  // If the host and device have different C++ ABIs, create a device mangle
+  // context so that mangling retrieves the additional device lambda mangling
+  // number instead of the regular host one.
+  if (cgm.getASTContext().getAuxTargetInfo() &&
+      cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+      cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
+    return std::unique_ptr<MangleContext>(
+        cgm.getASTContext().createDeviceMangleContext(
+            *cgm.getASTContext().getAuxTargetInfo()));
+  }
+
+  return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext(
+      cgm.getASTContext().getAuxTargetInfo()));
+}
+
+CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
+    : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) {
+  if (cgm.getLangOpts().OffloadViaLLVM)
+    llvm_unreachable("NYI");
+  else if (cgm.getLangOpts().HIP)
+    Prefix = "hip";
+  else
+    Prefix = "cuda";
+}
+
+mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
+                                                   mlir::Location loc,
+                                                   FunctionArgList &args) {
+  auto &builder = cgm.getBuilder();
+
+  // Build void *args[] and populate it with the addresses of the kernel
+  // arguments.
+  auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
+  mlir::Value kernelArgs = builder.createAlloca(
+      loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy,
+      "kernel_args", CharUnits::fromQuantity(16));
+
+  mlir::Value kernelArgsDecayed =
+      builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
+                         cir::PointerType::get(cgm.voidPtrTy));
+
+  for (auto [i, arg] : llvm::enumerate(args)) {
+    mlir::Value index =
+        builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
+    mlir::Value storePos =
+        builder.createPtrStride(loc, kernelArgsDecayed, index);
+
+    // Get the address of the argument and cast the store destination to match
+    // its pointer-to-pointer type. This is needed because upstream's
+    // createStore doesn't auto-bitcast like the incubator version does.
+    mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
+    mlir::Value storePosTyped = builder.createBitcast(
+        storePos, cir::PointerType::get(argAddr.getType()));
+
+    builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped);
+  }
+
+  return kernelArgsDecayed;
+}
+
+// CUDA 9.0+ uses a new way to launch kernels: parameters are packed into a
+// local array and the kernel is launched through cudaLaunchKernel().
+void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
+                                                cir::FuncOp fn,
+                                                FunctionArgList &args) {
+
+  // This requires arguments to be sent to kernels in a different way.
+  if (cgm.getLangOpts().OffloadViaLLVM)
+    cgm.errorNYI("Offload via LLVM");
+
+  auto &builder = cgm.getBuilder();
+  auto loc = fn.getLoc();
+
+  // For [cuda|hip]LaunchKernel, we must add another layer of indirection
+  // to the arguments. For example, for a function `add(int a, float b)`,
+  // we need to pass its arguments as `void *args[2] = { &a, &b }`.
+  mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
+
+  // Look up the cudaLaunchKernel/hipLaunchKernel function.
+  // The HIP kernel-launching API name depends on the -fgpu-default-stream
+  // option. For the default value 'legacy', it is hipLaunchKernel; for
+  // 'per-thread', it is hipLaunchKernel_spt.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+  //                              dim3 blockDim, void **args,
+  //                              size_t sharedMem, cudaStream_t stream);
+  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
+  //                                  dim3 blockDim, void **args,
+  //                                  size_t sharedMem, hipStream_t stream);
+  TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
+  DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
+
+  // The default stream is usually stream 0 (the legacy default stream).
+  // For a per-thread default stream, we need a different LaunchKernel
+  // function.
+  std::string kernelLaunchAPI = "LaunchKernel";
+  if (cgm.getLangOpts().GPUDefaultStream ==
+      LangOptions::GPUDefaultStreamKind::PerThread)
+    cgm.errorNYI("CUDA/HIP Stream per thread");
+
+  std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
+  const IdentifierInfo &launchII =
+      cgm.getASTContext().Idents.get(launchKernelName);
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (auto *result : dc->lookup(&launchII)) {
+    if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
+      cudaLaunchKernelFD = fd;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+    cgm.error(cgf.curFuncDecl->getLocation(),
+              "Can't find declaration for " + launchKernelName);
+    return;
+  }
+
+  // Use this function to retrieve the arguments for cudaLaunchKernel:
+  // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
+  //                                      size_t *sharedMem,
+  //                                      cudaStream_t *stream)
+  //
+  // Here [cuda|hip]Stream_t, while also being the 6th argument of
+  // [cuda|hip]LaunchKernel, is a pointer to an opaque struct.
+
+  mlir::Type dim3Ty = cgf.getTypes().convertType(
+      cudaLaunchKernelFD->getParamDecl(1)->getType());
+  mlir::Type streamTy = cgf.getTypes().convertType(
+      cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  mlir::Value gridDim =
+      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+                           "grid_dim", CharUnits::fromQuantity(8));
+  mlir::Value blockDim =
+      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+                           "block_dim", CharUnits::fromQuantity(8));
+  mlir::Value sharedMem =
+      builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
+                           "shared_mem", cgm.getSizeAlign());
+  mlir::Value stream =
+      builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
+                           "stream", cgm.getPointerAlign());
+
+  cir::FuncOp popConfig = cgm.createRuntimeFunction(
+      cir::FuncType::get({gridDim.getType(), blockDim.getType(),
+                          sharedMem.getType(), stream.getType()},
+                         cgm.sInt32Ty),
+      addUnderscoredPrefixToName("PopCallConfiguration"));
+  cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
+
+  // Now emit the call to cudaLaunchKernel:
+  // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
+  //                                          dim3 blockDim, void **args,
+  //                                          size_t sharedMem,
+  //                                          [cuda|hip]Stream_t stream);
+
+  // We now pick either the stub function (CUDA) or the stub global (HIP),
+  // respectively, as the kernel handle.
+  auto kernel = [&]() {
+    if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
+            kernelHandles[fn.getSymName()])) {
+      auto kernelTy = cir::PointerType::get(globalOp.getSymType());
+      mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy,
+                                                    globalOp.getSymName());
+      return kernel;
+    }
+    if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
+            kernelHandles[fn.getSymName()])) {
+      auto kernelTy = cir::PointerType::get(funcOp.getFunctionType());
+      mlir::Value kernel =
+          cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
+      mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy);
+      return func;
+    }
+    llvm_unreachable("Expected stub handle to be cir::GlobalOp or cir::FuncOp");
+  }();
+
+  CallArgList launchArgs;
+  launchArgs.add(RValue::get(kernel),
+                 cudaLaunchKernelFD->getParamDecl(0)->getType());
+  launchArgs.add(
+      RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
+      cudaLaunchKernelFD->getParamDecl(1)->getType());
+  launchArgs.add(
+      RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
+      cudaLaunchKernelFD->getParamDecl(2)->getType());
+  launchArgs.add(RValue::get(kernelArgs),
+                 cudaLaunchKernelFD->getParamDecl(3)->getType());
+  launchArgs.add(
+      RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
+      cudaLaunchKernelFD->getParamDecl(4)->getType());
+  launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
+                 cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  mlir::Type launchTy =
+      cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
+  mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
+      cast<cir::FuncType>(launchTy), launchKernelName);
+  const auto &callInfo =
+      cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+  cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
+               ReturnValueSlot(), launchArgs);
+
+  if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+      !cgf.getLangOpts().HIP)
+    cgm.errorNYI("MSVC CUDA stub handling");
+}
+
+void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+                                         FunctionArgList &args) {
+
+  if (auto globalOp =
+          llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
+    auto &builder = cgm.getBuilder();
+    auto fnPtrTy = globalOp.getSymType();
+    auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
+    auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
+
+    globalOp->setAttr("initial_value", gv);
+    globalOp->removeAttr("sym_visibility");
+    globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+                                       cgm.getPointerAlign().getQuantity()));
+  }
+
+  // CUDA 9.0 changed the way kernels are launched.
+  if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+      (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
+      cgm.getLangOpts().OffloadViaLLVM)
+    emitDeviceStubBodyNew(cgf, fn, args);
+  else
+    cgm.errorNYI("Emit Stub Body Legacy");
+}
+
+CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
+  return new CIRGenNVCUDARuntime(cgm);
+}
+
+CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
+
+mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
+                                                      GlobalDecl GD) {
+
+  // Check if we already have a kernel handle for this function.
+  auto Loc = kernelHandles.find(fn.getSymName());
----------------
andykaylor wrote:

```suggestion
  auto it = kernelHandles.find(fn.getSymName());
```

The prevalence of `loc` in MLIR code makes it a bad identifier to use for anything other than an `mlir::Location`.

https://github.com/llvm/llvm-project/pull/177790
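
To make the control flow of `emitDeviceStubBodyNew` concrete, here is a minimal, self-contained C++ sketch of what the emitted host stub corresponds to for a hypothetical kernel `add(int a, float b)`. The stand-in types, the stub name `__device_stub__add`, and the exact entry-point signatures are assumptions taken from the comments quoted in the patch, not code from the PR:

```cpp
#include <stddef.h>

// Stand-in types so the sketch compiles without the CUDA headers; the real
// definitions live in cuda_runtime.h.
struct dim3 {
  unsigned x = 1, y = 1, z = 1;
};
using cudaStream_t = struct CUstream_st *;
using cudaError_t = int;

// Runtime entry points, with the signatures quoted in the patch's comments.
extern "C" int __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
                                          size_t *sharedMem,
                                          cudaStream_t *stream);
extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                        dim3 blockDim, void **args,
                                        size_t sharedMem, cudaStream_t stream);

// Hypothetical stub for a kernel `add(int a, float b)` (names illustrative).
void __device_stub__add(int a, float b) {
  // The extra layer of indirection: pass the *addresses* of the arguments.
  void *args[2] = {&a, &b};

  // Pop the <<<...>>> launch configuration that the call site pushed.
  dim3 gridDim, blockDim;
  size_t sharedMem;
  cudaStream_t stream;
  __cudaPopCallConfiguration(&gridDim, &blockDim, &sharedMem, &stream);

  // On CUDA the stub itself doubles as the kernel handle, so its own address
  // is passed as `func`; on HIP a separate stub global plays that role.
  cudaLaunchKernel((const void *)__device_stub__add, gridDim, blockDim, args,
                   sharedMem, stream);
}
```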

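The configuration that the stub pops has to be pushed by the kernel's call site; that lowering is outside this patch. The sketch below assumes the conventional `__cudaPushCallConfiguration` entry point (its signature here is an assumption) and reuses the stand-in types and stub from the previous sketch to show how a triple-chevron launch would pair with the stub:

```cpp
// Caller-side counterpart: assumption about how `add<<<grid, block>>>(1,
// 2.0f)` is lowered. Builds on the declarations in the sketch above.
extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
                                                size_t sharedMem,
                                                cudaStream_t stream);
void __device_stub__add(int a, float b); // from the sketch above

void launch_add(dim3 grid, dim3 block) {
  // Push grid/block/sharedMem/stream for the stub to pop...
  __cudaPushCallConfiguration(grid, block, /*sharedMem=*/0,
                              /*stream=*/nullptr);
  // ...then call the stub, which forwards everything to cudaLaunchKernel.
  __device_stub__add(1, 2.0f);
}
```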