https://github.com/RiverDave updated https://github.com/llvm/llvm-project/pull/177790

>From f703a61ff33f2d8026cf4bece0416afc331e4dbf Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Sat, 24 Jan 2026 14:12:48 -0500
Subject: [PATCH 1/3] [CIR][CUDA] Upstream device stub mangling

---
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      |  4 +-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 10 ++-
 .../test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 22 ++++++
 clang/test/CIR/CodeGen/inputs/cuda.h          | 74 +++++++++++++++++++
 4 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
 create mode 100644 clang/test/CIR/CodeGen/inputs/cuda.h

diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index f2d73720a9c2b..4c212b06019ea 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -748,7 +748,9 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
       emitConstructorBody(args);
     } else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
                funcDecl->hasAttr<CUDAGlobalAttr>()) {
-      getCIRGenModule().errorNYI(bodyRange, "CUDA kernel");
+      // TODO(cir): Emit device stub body with kernel launch runtime calls
+      // (emitDeviceStub). For now, emit an empty stub.
+      assert(!cir::MissingFeatures::cudaSupport());
     } else if (isa<CXXMethodDecl>(funcDecl) &&
                cast<CXXMethodDecl>(funcDecl)->isLambdaStaticInvoker()) {
       // The lambda static invoker function is special, because it forwards or
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 61d84f197e6ec..b535eab913a5d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1772,9 +1772,15 @@ static std::string getMangledNameImpl(CIRGenModule &cgm, GlobalDecl gd,
       cgm.errorNYI(nd->getSourceRange(), "getMangledName: X86RegCall");
     } else if (fd && fd->hasAttr<CUDAGlobalAttr>() &&
                gd.getKernelReferenceKind() == KernelReferenceKind::Stub) {
-      cgm.errorNYI(nd->getSourceRange(), "getMangledName: CUDA device stub");
+      out << "__device_stub__" << ii->getName();
+    } else if (fd &&
+               DeviceKernelAttr::isOpenCLSpelling(
+                   fd->getAttr<DeviceKernelAttr>()) &&
+               gd.getKernelReferenceKind() == KernelReferenceKind::Stub) {
+      cgm.errorNYI(nd->getSourceRange(), "getMangledName: OpenCL Stub");
+    } else {
+      out << ii->getName();
     }
-    out << ii->getName();
   }
 
   // Check if the module name hash should be appended for internal linkage
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
new file mode 100644
index 0000000000000..6d5efb69827e3
--- /dev/null
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
@@ -0,0 +1,22 @@
+// Based on clang/test/CodeGenCUDA/kernel-stub-name.cu.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \
+// RUN:   -x cuda -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+#include "../inputs/cuda.h"
+
+// CHECK: cir.func {{.*}} @__device_stub__ckernel()
+// CHECK-NEXT:   cir.return
+// CHECK-NEXT: }
+extern "C" __global__ void ckernel() {}
+
+// CHECK: cir.func {{.*}} @_ZN2ns23__device_stub__nskernelEv()
+namespace ns {
+__global__ void nskernel() {}
+} // namespace ns
+
+// CHECK: cir.func {{.*}} @_Z25__device_stub__kernelfuncIiEvv()
+template <class T>
+__global__ void kernelfunc() {}
+template __global__ void kernelfunc<int>();
diff --git a/clang/test/CIR/CodeGen/inputs/cuda.h b/clang/test/CIR/CodeGen/inputs/cuda.h
new file mode 100644
index 0000000000000..204bf2972088d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/inputs/cuda.h
@@ -0,0 +1,74 @@
+/* Minimal declarations for CUDA support.  Testing purposes only. */
+/* From test/CodeGenCUDA/Inputs/cuda.h. */
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#define __grid_constant__
+#endif
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
+#elif __OFFLOAD_VIA_LLVM__
+extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                     size_t sharedMem = 0, void *stream = 0);
+extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 
blockDim,
+                          void **args, size_t sharedMem = 0, void *stream = 0);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+
+#endif
+
+extern "C" __device__ int printf(const char*, ...);

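Note on the mangling behavior exercised by the test above: unmangled (extern "C")
kernels simply gain a "__device_stub__" prefix on the host side, while for C++
kernels the prefix ends up embedded inside the mangled name, since it is treated
as part of the identifier. A compact summary of the expected stub symbols,
mirroring the FileCheck lines in kernel-stub-name.cu (illustrative only, not part
of the patch):

    extern "C" __global__ void ckernel();        // stub: __device_stub__ckernel
    namespace ns { __global__ void nskernel(); } // stub: _ZN2ns23__device_stub__nskernelEv
                                                 // (23 == strlen("__device_stub__nskernel"))
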
>From 1892d27374892cb04af492c4cc63a9129056d257 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Sat, 24 Jan 2026 14:39:09 -0500
Subject: [PATCH 2/3] make test include cleaner

---
 clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
index 6d5efb69827e3..da2dbd9843c7c 100644
--- a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
@@ -1,10 +1,10 @@
 // Based on clang/test/CodeGenCUDA/kernel-stub-name.cu.
 
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \
-// RUN:   -x cuda -o %t.cir
+// RUN:   -I%S/../inputs/ -x cuda -o %t.cir
 // RUN: FileCheck --input-file=%t.cir %s
 
-#include "../inputs/cuda.h"
+#include "cuda.h"
 
 // CHECK: cir.func {{.*}} @__device_stub__ckernel()
 // CHECK-NEXT:   cir.return

>From 601edf35ccd87c52d5319922238ebb3196265a6c Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Tue, 27 Jan 2026 15:59:09 -0500
Subject: [PATCH 3/3] [CIR][CUDA][HIP] Implement stub body emission

---
 clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp        | 358 ++++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp   |  20 +
 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h     |  50 +++
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      |   4 +-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        |  14 +-
 clang/lib/CIR/CodeGen/CIRGenModule.h          |   9 +
 clang/lib/CIR/CodeGen/CMakeLists.txt          |   2 +
 clang/test/CIR/CodeGen/CUDA/kernel-call.cu    |  18 +
 .../test/CIR/CodeGen/CUDA/kernel-stub-name.cu |   8 +-
 clang/test/CIR/CodeGen/inputs/cuda.h          |   6 +
 10 files changed, 481 insertions(+), 8 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
 create mode 100644 clang/test/CIR/CodeGen/CUDA/kernel-call.cu

diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
new file mode 100644
index 0000000000000..acdc811b7a308
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -0,0 +1,358 @@
+//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a concrete implementation of CUDA CIR generation targeting
+// the NVIDIA CUDA and HIP host runtime libraries (a subclass of the abstract
+// CIRGenCUDARuntime interface).
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCUDARuntime.h"
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+#include "mlir/IR/Operation.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/GlobalDecl.h"
+#include "clang/Basic/AddressSpaces.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "llvm/Support/Casting.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+
+class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
+protected:
+  StringRef Prefix;
+
+  // Map a device stub function to a symbol used to identify the kernel in
+  // host code. For CUDA, that symbol is the device stub function itself.
+  // For HIP, the two are different.
+  llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles;
+
+  // Map a kernel handle to the kernel stub.
+  llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
+  // Mangle context for device.
+  std::unique_ptr<MangleContext> deviceMC;
+
+private:
+  void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
+                             FunctionArgList &args);
+  mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
+                                FunctionArgList &args);
+  mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
+  std::string addPrefixToName(StringRef funcName) const;
+  std::string addUnderscoredPrefixToName(StringRef funcName) const;
+
+public:
+  CIRGenNVCUDARuntime(CIRGenModule &cgm);
+  ~CIRGenNVCUDARuntime();
+
+  void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+                      FunctionArgList &args) override;
+};
+
+} // namespace
+
+std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
+  return (Prefix + funcName).str();
+}
+
+std::string
+CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
+  return ("__" + Prefix + funcName).str();
+}
+
+static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) {
+  // If the host and device have different C++ ABIs, mark it as the device
+  // mangle context so that the mangling needs to retrieve the additional
+  // device lambda mangling number instead of the regular host one.
+  if (cgm.getASTContext().getAuxTargetInfo() &&
+      cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+      cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
+    return std::unique_ptr<MangleContext>(
+        cgm.getASTContext().createDeviceMangleContext(
+            *cgm.getASTContext().getAuxTargetInfo()));
+  }
+
+  return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext(
+      cgm.getASTContext().getAuxTargetInfo()));
+}
+
+CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
+    : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) {
+  if (cgm.getLangOpts().OffloadViaLLVM)
+    llvm_unreachable("NYI");
+  else if (cgm.getLangOpts().HIP)
+    Prefix = "hip";
+  else
+    Prefix = "cuda";
+}
+
+mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
+                                                   mlir::Location loc,
+                                                   FunctionArgList &args) {
+  auto &builder = cgm.getBuilder();
+
+  // Build void *args[] and populate with the addresses of kernel arguments.
+  auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
+  mlir::Value kernelArgs = builder.createAlloca(
+      loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
+      CharUnits::fromQuantity(16));
+
+  mlir::Value kernelArgsDecayed =
+      builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
+                         cir::PointerType::get(cgm.voidPtrTy));
+
+  for (auto [i, arg] : llvm::enumerate(args)) {
+    mlir::Value index =
+        builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
+    mlir::Value storePos =
+        builder.createPtrStride(loc, kernelArgsDecayed, index);
+
+    // Get the address of the argument and cast the store destination to match
+    // its pointer-to-pointer type. This is needed because upstream's
+    // createStore doesn't auto-bitcast like the incubator version.
+    mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
+    mlir::Value storePosTyped = builder.createBitcast(
+        storePos, cir::PointerType::get(argAddr.getType()));
+
+    builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped);
+  }
+
+  return kernelArgsDecayed;
+}
+
+// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a
+// local array and kernels are launched using cudaLaunchKernel().
+void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
+                                                cir::FuncOp fn,
+                                                FunctionArgList &args) {
+
+  // This requires arguments to be sent to kernels in a different way.
+  if (cgm.getLangOpts().OffloadViaLLVM)
+    cgm.errorNYI("Offload via LLVM");
+
+  auto &builder = cgm.getBuilder();
+  auto loc = fn.getLoc();
+
+  // For [cuda|hip]LaunchKernel, we must add another layer of indirection
+  // to arguments. For example, for function `add(int a, float b)`,
+  // we need to pass it as `void *args[2] = { &a, &b }`.
+  mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
+
+  // Lookup cudaLaunchKernel/hipLaunchKernel function.
+  // HIP kernel launching API name depends on -fgpu-default-stream option. For
+  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
+  // it is hipLaunchKernel_spt.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //                              void **args, size_t sharedMem,
+  //                              cudaStream_t stream);
+  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
+  //                                  dim3 blockDim, void **args,
+  //                                  size_t sharedMem, hipStream_t stream);
+  TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
+  DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
+
+  // The default stream is usually stream 0 (the legacy default stream).
+  // For per-thread default stream, we need a different LaunchKernel function.
+  std::string kernelLaunchAPI = "LaunchKernel";
+  if (cgm.getLangOpts().GPUDefaultStream ==
+      LangOptions::GPUDefaultStreamKind::PerThread)
+    cgm.errorNYI("CUDA/HIP Stream per thread");
+
+  std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
+  const IdentifierInfo &launchII =
+      cgm.getASTContext().Idents.get(launchKernelName);
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (auto *result : dc->lookup(&launchII)) {
+    if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
+      cudaLaunchKernelFD = fd;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+    cgm.error(cgf.curFuncDecl->getLocation(),
+              "Can't find declaration for " + launchKernelName);
+    return;
+  }
+
+  // Use this function to retrieve arguments for cudaLaunchKernel:
+  // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
+  //                                *sharedMem, cudaStream_t *stream)
+  //
+  // Here [cuda|hip]Stream_t, while also being the 6th argument of
+  // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.
+
+  mlir::Type dim3Ty = cgf.getTypes().convertType(
+      cudaLaunchKernelFD->getParamDecl(1)->getType());
+  mlir::Type streamTy = cgf.getTypes().convertType(
+      cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  mlir::Value gridDim =
+      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+                           "grid_dim", CharUnits::fromQuantity(8));
+  mlir::Value blockDim =
+      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+                           "block_dim", CharUnits::fromQuantity(8));
+  mlir::Value sharedMem =
+      builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
+                           "shared_mem", cgm.getSizeAlign());
+  mlir::Value stream =
+      builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
+                           "stream", cgm.getPointerAlign());
+
+  cir::FuncOp popConfig = cgm.createRuntimeFunction(
+      cir::FuncType::get({gridDim.getType(), blockDim.getType(),
+                          sharedMem.getType(), stream.getType()},
+                         cgm.sInt32Ty),
+      addUnderscoredPrefixToName("PopCallConfiguration"));
+  cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
+
+  // Now emit the call to cudaLaunchKernel
+  // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
+  // dim3 blockDim,
+  //                              void **args, size_t sharedMem,
+  //                              [cuda|hip]Stream_t stream);
+
+  // We now pick either the function or the stub global, for CUDA and HIP
+  // respectively.
+  auto kernel = [&]() {
+    if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
+            kernelHandles[fn.getSymName()])) {
+      auto kernelTy = cir::PointerType::get(globalOp.getSymType());
+      mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy,
+                                                    globalOp.getSymName());
+      return kernel;
+    }
+    if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
+            kernelHandles[fn.getSymName()])) {
+      auto kernelTy = cir::PointerType::get(funcOp.getFunctionType());
+      mlir::Value kernel =
+          cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
+      mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy);
+      return func;
+    }
+    llvm_unreachable("Expected stub handle to be cir::GlobalOp or cir::FuncOp");
+  }();
+
+  CallArgList launchArgs;
+  launchArgs.add(RValue::get(kernel),
+                 cudaLaunchKernelFD->getParamDecl(0)->getType());
+  launchArgs.add(
+      RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
+      cudaLaunchKernelFD->getParamDecl(1)->getType());
+  launchArgs.add(
+      RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
+      cudaLaunchKernelFD->getParamDecl(2)->getType());
+  launchArgs.add(RValue::get(kernelArgs),
+                 cudaLaunchKernelFD->getParamDecl(3)->getType());
+  launchArgs.add(
+      RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
+      cudaLaunchKernelFD->getParamDecl(4)->getType());
+  launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
+                 cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  mlir::Type launchTy =
+      cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
+  mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
+      cast<cir::FuncType>(launchTy), launchKernelName);
+  const auto &callInfo =
+      cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+  cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
+               ReturnValueSlot(), launchArgs);
+
+  if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+      !cgf.getLangOpts().HIP)
+    cgm.errorNYI("MSVC CUDA stub handling");
+}
+
+void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+                                         FunctionArgList &args) {
+
+  if (auto globalOp =
+          llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
+    auto &builder = cgm.getBuilder();
+    auto fnPtrTy = globalOp.getSymType();
+    auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
+    auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
+
+    globalOp->setAttr("initial_value", gv);
+    globalOp->removeAttr("sym_visibility");
+    globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+                                       cgm.getPointerAlign().getQuantity()));
+  }
+
+  // CUDA 9.0 changed the way to launch kernels.
+  if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+      (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
+      cgm.getLangOpts().OffloadViaLLVM)
+    emitDeviceStubBodyNew(cgf, fn, args);
+  else
+    cgm.errorNYI("Emit Stub Body Legacy");
+}
+
+CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
+  return new CIRGenNVCUDARuntime(cgm);
+}
+
+CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
+
+mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
+                                                      GlobalDecl GD) {
+
+  // Check if we already have a kernel handle for this function
+  auto Loc = kernelHandles.find(fn.getSymName());
+  if (Loc != kernelHandles.end()) {
+    auto OldHandle = Loc->second;
+    // Here we know that fn did not change. Return it.
+    if (kernelStubs[OldHandle] == fn)
+      return OldHandle;
+
+    // We've found the function name, but fn itself has changed, so we need to
+    // update the references.
+    if (cgm.getLangOpts().HIP) {
+      // For HIP compilation the handle itself does not change, so we only need
+      // to update the Stub value.
+      kernelStubs[OldHandle] = fn;
+      return OldHandle;
+    }
+    // For non-HIP compilation, erase the old Stub and fall-through to creating
+    // new entries.
+    kernelStubs.erase(OldHandle);
+  }
+
+  // If not targeting HIP, store the function itself
+  if (!cgm.getLangOpts().HIP) {
+    kernelHandles[fn.getSymName()] = fn;
+    kernelStubs[fn] = fn;
+    return fn;
+  }
+
+  // Create a new CIR global variable to represent the kernel handle
+  auto &builder = cgm.getBuilder();
+  auto globalName = cgm.getMangledName(
+      GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
+  const VarDecl *varDecl = llvm::dyn_cast_or_null<VarDecl>(GD.getDecl());
+  cir::GlobalOp globalOp =
+      cgm.getOrCreateCIRGlobal(globalName, fn.getFunctionType().getReturnType(),
+                               LangAS::Default, varDecl, NotForDefinition);
+
+  globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+                                     cgm.getPointerAlign().getQuantity()));
+
+  // Store references
+  kernelHandles[fn.getSymName()] = globalOp;
+  kernelStubs[globalOp] = fn;
+
+  return globalOp;
+}
\ No newline at end of file
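To make emitDeviceStubBodyNew easier to review, here is a hand-written host-side
C++ equivalent of the stub body it emits for a kernel such as
__global__ void add(int a, float b). This is only a sketch: the patch emits CIR
operations rather than C++, the real stub carries the mangled __device_stub__
name, and in the CUDA (non-HIP) case the stub's own address is what gets passed
as the kernel handle:

    void __device_stub__add(int a, float b) {
      // cudaLaunchKernel takes void **args, hence the extra indirection.
      void *kernel_args[2] = {&a, &b};
      dim3 grid_dim, block_dim;
      size_t shared_mem;
      cudaStream_t stream;
      // Retrieve the <<<...>>> configuration pushed by the caller.
      __cudaPopCallConfiguration(&grid_dim, &block_dim, &shared_mem, &stream);
      // Launch through the CUDA 9.0+ API.
      cudaLaunchKernel((void *)__device_stub__add, grid_dim, block_dim,
                       kernel_args, shared_mem, stream);
    }
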
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
new file mode 100644
index 0000000000000..c438c968c24ce
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
@@ -0,0 +1,20 @@
+//===----- CIRGenCUDARuntime.cpp - Interface to CUDA Runtimes -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides an abstract class for CUDA CIR generation.  Concrete
+// subclasses of this implement code generation for specific CUDA
+// runtime libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCUDARuntime.h"
+
+using namespace clang;
+using namespace CIRGen;
+
+CIRGenCUDARuntime::~CIRGenCUDARuntime() {}
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
new file mode 100644
index 0000000000000..a0809c1d185b8
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
@@ -0,0 +1,50 @@
+//===------ CIRGenCUDARuntime.h - Interface to CUDA Runtimes -----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides an abstract class for CUDA CIR generation. Concrete
+// subclasses of this implement code generation for specific CUDA
+// runtime libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H
+#define LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H
+
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+
+namespace clang {
+class CUDAKernelCallExpr;
+}
+
+namespace clang::CIRGen {
+
+class CIRGenFunction;
+class CIRGenModule;
+class FunctionArgList;
+class RValue;
+class ReturnValueSlot;
+
+class CIRGenCUDARuntime {
+protected:
+  CIRGenModule &cgm;
+
+public:
+  CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {}
+  virtual ~CIRGenCUDARuntime();
+
+  virtual void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+                              FunctionArgList &args) = 0;
+
+  virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) = 0;
+};
+
+CIRGenCUDARuntime *createNVCUDARuntime(CIRGenModule &cgm);
+
+} // namespace clang::CIRGen
+
+#endif // LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 4c212b06019ea..c900797e54c81 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -748,9 +748,7 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
       emitConstructorBody(args);
     } else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
                funcDecl->hasAttr<CUDAGlobalAttr>()) {
-      // TODO(cir): Emit device stub body with kernel launch runtime calls
-      // (emitDeviceStub). For now, emit an empty stub.
-      assert(!cir::MissingFeatures::cudaSupport());
+      cgm.getCUDARuntime().emitDeviceStub(*this, fn, args);
     } else if (isa<CXXMethodDecl>(funcDecl) &&
                cast<CXXMethodDecl>(funcDecl)->isLambdaStaticInvoker()) {
       // The lambda static invoker function is special, because it forwards or
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index b535eab913a5d..8cef5408bbfc1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CIRGenModule.h"
+#include "CIRGenCUDARuntime.h"
 #include "CIRGenCXXABI.h"
 #include "CIRGenConstantEmitter.h"
 #include "CIRGenFunction.h"
@@ -31,6 +32,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/IR/Verifier.h"
 
 #include <algorithm>
@@ -68,7 +70,8 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
       langOpts(astContext.getLangOpts()), codeGenOpts(cgo),
       theModule{mlir::ModuleOp::create(mlir::UnknownLoc::get(&mlirContext))},
       diags(diags), target(astContext.getTargetInfo()),
-      abi(createCXXABI(*this)), genTypes(*this), vtables(*this) {
+      abi(createCXXABI(*this)), genTypes(*this), vtables(*this),
+      cudaRuntime(clang::CIRGen::createNVCUDARuntime((*this))) {
 
   // Initialize cached types
   voidTy = cir::VoidType::get(&getMLIRContext());
@@ -1748,6 +1751,15 @@ cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd,
   cir::FuncOp func =
       getOrCreateCIRFunction(mangledName, funcType, gd, forVTable, dontDefer,
                              /*isThunk=*/false, isForDefinition);
+  // Return the kernel handle for a CUDA/HIP kernel stub function.
+  if (langOpts.CUDA && !langOpts.CUDAIsDevice &&
+      cast<FunctionDecl>(gd.getDecl())->hasAttr<CUDAGlobalAttr>()) {
+    mlir::Operation *handle = getCUDARuntime().getKernelHandle(func, gd);
+
+    if (isForDefinition)
+      return func;
+    return mlir::dyn_cast<cir::FuncOp>(*handle);
+  }
   return func;
 }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 3c4f35bacc4f9..6f301bd83d373 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -15,6 +15,7 @@
 
 #include "CIRGenBuilder.h"
 #include "CIRGenCall.h"
+#include "CIRGenCUDARuntime.h"
 #include "CIRGenTypeCache.h"
 #include "CIRGenTypes.h"
 #include "CIRGenVTables.h"
@@ -90,6 +91,9 @@ class CIRGenModule : public CIRGenTypeCache {
   /// Holds information about C++ vtables.
   CIRGenVTables vtables;
 
+  /// Holds the CUDA runtime.
+  std::unique_ptr<CIRGenCUDARuntime> cudaRuntime;
+
   /// Per-function codegen information. Updated everytime emitCIR is called
   /// for FunctionDecls's.
   CIRGenFunction *curCGF = nullptr;
@@ -593,6 +597,11 @@ class CIRGenModule : public CIRGenTypeCache {
   /// Function* for "fabsf".
   cir::FuncOp getBuiltinLibFunction(const FunctionDecl *fd, unsigned builtinID);
 
+  CIRGenCUDARuntime &getCUDARuntime() {
+    assert(cudaRuntime != nullptr);
+    return *cudaRuntime;
+  }
+
   mlir::IntegerAttr getSize(CharUnits size) {
     return builder.getSizeFromCharUnits(size);
   }
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 8efa587f31aac..ff5e666a72bef 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -18,6 +18,8 @@ add_clang_library(clangCIR
   CIRGenClass.cpp
   CIRGenCleanup.cpp
   CIRGenCoroutine.cpp
+  CIRGenCUDANV.cpp
+  CIRGenCUDARuntime.cpp
   CIRGenCXX.cpp
   CIRGenCXXABI.cpp
   CIRGenDecl.cpp
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-call.cu b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu
new file mode 100644
index 0000000000000..d1dae134b0230
--- /dev/null
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu
@@ -0,0 +1,18 @@
+// Based on clang/test/CodeGenCUDA/kernel-call.cu.
+// Tests device stub body emission for CUDA kernels.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN:   -emit-cir %s -I%S/../inputs/ -x cuda -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CUDA-NEW
+
+
+#include "cuda.h"
+
+
+// TODO: Test CUDA legacy (< 9.0) when legacy stub body is implemented
+// TODO: Test HIP when HIP stub body support is complete
+
+// CUDA-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelv
+// CUDA-NEW: cir.call @__cudaPopCallConfiguration
+// CUDA-NEW: cir.call @cudaLaunchKernel
+__global__ void kernel() {}
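For context on where the popped configuration comes from: on the host, a
triple-chevron launch is lowered (roughly) to a push of the launch configuration
followed by a call to the kernel's stub, which pops the configuration and
forwards to cudaLaunchKernel. A hand-written sketch against the declarations in
cuda.h above; the stub name and caller are hypothetical, and this test only
checks the stub body, not a call site:

    extern "C" void __device_stub__kern();  // hypothetical unmangled stub name
    void host_caller() {
      // kern<<<dim3(1), dim3(1)>>>() behaves roughly like:
      __cudaPushCallConfiguration(dim3(1), dim3(1));
      __device_stub__kern();  // pops the configuration, calls cudaLaunchKernel
    }
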
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
index da2dbd9843c7c..0edf256ccf961 100644
--- a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
@@ -1,13 +1,13 @@
 // Based on clang/test/CodeGenCUDA/kernel-stub-name.cu.
 
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \
-// RUN:   -I%S/../inputs/ -x cuda -o %t.cir
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN:   -emit-cir %s -I%S/../inputs/ -x cuda -o %t.cir
 // RUN: FileCheck --input-file=%t.cir %s
 
 #include "cuda.h"
 
-// CHECK: cir.func {{.*}} @__device_stub__ckernel()
-// CHECK-NEXT:   cir.return
+// CHECK: cir.func {{.*}} @[[CSTUB:__device_stub__ckernel]]()
+// CHECK: cir.return
 // CHECK-NEXT: }
 extern "C" __global__ void ckernel() {}
 
diff --git a/clang/test/CIR/CodeGen/inputs/cuda.h b/clang/test/CIR/CodeGen/inputs/cuda.h
index 204bf2972088d..225c7dfdcf0db 100644
--- a/clang/test/CIR/CodeGen/inputs/cuda.h
+++ b/clang/test/CIR/CodeGen/inputs/cuda.h
@@ -37,6 +37,9 @@ int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
 extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
                                                  size_t sharedSize = 0,
                                                  hipStream_t stream = 0);
+extern "C" int __hipPopCallConfiguration(dim3 *gridSize, dim3 *blockSize,
+                                         size_t *sharedSize,
+                                         hipStream_t *stream);
 #ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
 extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
                                       dim3 blockDim, void **args,
@@ -62,6 +65,9 @@ extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
 extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
                                            size_t sharedSize = 0,
                                            cudaStream_t stream = 0);
+extern "C" int __cudaPopCallConfiguration(dim3 *gridSize, dim3 *blockSize,
+                                          size_t *sharedSize,
+                                          cudaStream_t *stream);
 extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                         dim3 blockDim, void **args,
                                         size_t sharedMem, cudaStream_t stream);
