https://github.com/RiverDave updated 
https://github.com/llvm/llvm-project/pull/187636

>From 2e61735445211aa3b15dfbcafad1209ec124016d Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Wed, 18 Mar 2026 21:10:59 -0400
Subject: [PATCH 01/11] [CIR][CUDA] Global emission for fatbin symbols

---
 .../clang/CIR/Dialect/IR/CIRCUDAAttrs.td      |  17 ++
 .../clang/CIR/Dialect/IR/CIRDialect.td        |   1 +
 clang/include/clang/CIR/MissingFeatures.h     |   2 +
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        |  10 ++
 .../Dialect/Transforms/LoweringPrepare.cpp    | 154 ++++++++++++++++++
 clang/test/CIR/CodeGenCUDA/device-stub.cu     |  50 ++++++
 6 files changed, 234 insertions(+)
 create mode 100644 clang/test/CIR/CodeGenCUDA/device-stub.cu

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td 
b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
index 5932db8323196..a5374f4ffd79b 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
@@ -50,5 +50,22 @@ def CUDAExternallyInitializedAttr : 
CIR_Attr<"CUDAExternallyInitialized",
   }];
   let canHaveIllegalCXXABIType = 0;
 }
+def CIR_CUDABinaryHandleAttr : CIR_Attr<
+  "CUDABinaryHandle", "cu.binary_handle"
+> {
+  let summary = "Fat binary handle for device code.";
+  let description =
+  [{
+    This attribute is attached to the ModuleOp and records the binary file
+    name passed to host.
+
+    CUDA first compiles device-side code into a fat binary file. The file
+    name is then passed into host-side code, which is used to create a handle
+    and then generate various registration functions.
+  }];
+
+  let parameters = (ins "std::string":$name);
+  let assemblyFormat = "`<` $name `>`";
+}
 
 #endif // CLANG_CIR_DIALECT_IR_CIRCUDAATTRS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td 
b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
index f1f94c868e5b0..f14478e36f3c0 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
@@ -75,6 +75,7 @@ def CIR_Dialect : Dialect {
     static llvm::StringRef getDefaultFuncAttrsAttrName() { return 
"default_func_attrs"; }
     static llvm::StringRef getResAttrsAttrName() { return "res_attrs"; }
     static llvm::StringRef getArgAttrsAttrName() { return "arg_attrs"; }
+    static llvm::StringRef getCUDABinaryHandleAttrName() { return 
"cir.cu.binary_handle"; }
 
     static llvm::StringRef getAMDGPUCodeObjectVersionAttrName() { return 
"cir.amdhsa_code_object_version"; }
     static llvm::StringRef getAMDGPUPrintfKindAttrName() { return 
"cir.amdgpu_printf_kind"; }
diff --git a/clang/include/clang/CIR/MissingFeatures.h 
b/clang/include/clang/CIR/MissingFeatures.h
index b9a6b83daa13c..ac02433fb504a 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -244,6 +244,8 @@ struct MissingFeatures {
   static bool ctorConstLvalueToRvalueConversion() { return false; }
   static bool ctorMemcpyizer() { return false; }
   static bool cudaSupport() { return false; }
+  static bool hipModuleCtor() { return false; }
+  static bool globalRegistration() { return false; }
   static bool dataLayoutTypeIsSized() { return false; }
   static bool dataLayoutTypeAllocSize() { return false; }
   static bool dataLayoutTypeStoreSize() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp 
b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index e0681eb760249..4877781b89ad5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -162,6 +162,16 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
                                                 /*line=*/0,
                                                 /*column=*/0));
   }
+
+  // Set CUDA GPU binary handle.
+  if (langOpts.CUDA) {
+    std::string cudaBinaryName = codeGenOpts.CudaGpuBinaryFileName;
+    if (!cudaBinaryName.empty()) {
+      theModule->setAttr(
+          cir::CIRDialect::getCUDABinaryHandleAttrName(),
+          cir::CUDABinaryHandleAttr::get(&mlirContext, cudaBinaryName));
+    }
+  }
 }
 
 CIRGenModule::~CIRGenModule() = default;
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 5a8e3be51a947..c4996efc4b50d 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/IRMapping.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/Attrs.inc"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/Specifiers.h"
@@ -108,6 +109,17 @@ struct LoweringPreparePass
       cir::GlobalLinkageKind linkage = cir::GlobalLinkageKind::ExternalLinkage,
       cir::VisibilityKind visibility = cir::VisibilityKind::Default);
 
+
+  /// ------------
+  /// CUDA registration related
+  /// ------------
+
+  llvm::StringMap<FuncOp> cudaKernelMap;
+
+  /// Build the CUDA module constructor that registers the fat binary
+  /// with the CUDA runtime.
+  void buildCUDAModuleCtor();
+
   /// Handle static local variable initialization with guard variables.
   void handleStaticLocal(cir::GlobalOp globalOp, cir::GetGlobalOp getGlobalOp);
 
@@ -1641,11 +1653,150 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) 
{
       globalCtorList.emplace_back(fnOp.getName(), globalCtor.value());
     else if (auto globalDtor = fnOp.getGlobalDtorPriority())
       globalDtorList.emplace_back(fnOp.getName(), globalDtor.value());
+
+    if (auto attr = fnOp->getAttr(cir::CUDAKernelNameAttr::getMnemonic())) {
+      auto kernelNameAttr = dyn_cast<CUDAKernelNameAttr>(attr);
+      std::string kernelName = kernelNameAttr.getKernelName();
+      cudaKernelMap[kernelName] = fnOp;
+    }
   } else if (auto threeWayCmp = dyn_cast<cir::CmpThreeWayOp>(op)) {
     lowerThreeWayCmpOp(threeWayCmp);
   }
 }
 
+static std::string getCUDAPrefix(clang::ASTContext *astCtx) {
+  if (astCtx->getLangOpts().HIP)
+    return "hip";
+  return "cuda";
+}
+
+static std::string addUnderscoredPrefix(llvm::StringRef prefix,
+                                        llvm::StringRef name) {
+  return ("__" + prefix + name).str();
+}
+
+/// Creates a global constructor function for the module:
+///
+/// For CUDA:
+/// \code
+/// void __cuda_module_ctor() {
+///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
+///     __cuda_register_globals(Handle);
+/// }
+/// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_ctor() {
+///     if (__hip_gpubin_handle == 0) {
+///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
+///         __hip_register_globals(__hip_gpubin_handle);
+///     }
+/// }
+/// \endcode
+void LoweringPreparePass::buildCUDAModuleCtor() {
+  bool isHIP = astCtx->getLangOpts().HIP;
+
+  if (isHIP)
+    assert(!cir::MissingFeatures::hipModuleCtor());
+  if (astCtx->getLangOpts().GPURelocatableDeviceCode)
+    llvm_unreachable("GPU RDC NYI");
+
+  // For CUDA without -fgpu-rdc, it's safe to stop generating ctor
+  // if there's nothing to register.
+  if (cudaKernelMap.empty())
+    return;
+
+  // There's no device-side binary, so no need to proceed for CUDA.
+  // HIP has to create an external symbol in this case, which is NYI.
+  mlir::Attribute cudaBinaryHandleAttr =
+      mlirModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName());
+  if (!cudaBinaryHandleAttr) {
+    if (astCtx->getLangOpts().HIP)
+      assert(!cir::MissingFeatures::hipModuleCtor());
+    return;
+  }
+
+  std::string cudaGPUBinaryName =
+      mlir::cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName();
+
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> gpuBinaryOrErr =
+      llvm::MemoryBuffer::getFile(cudaGPUBinaryName);
+  if (std::error_code ec = gpuBinaryOrErr.getError()) {
+    mlirModule->emitError("cannot open GPU binary file: " + cudaGPUBinaryName +
+                          ": " + ec.message());
+    return;
+  }
+  std::unique_ptr<llvm::MemoryBuffer> gpuBinary =
+      std::move(gpuBinaryOrErr.get());
+
+  // Set up common types and builder.
+  std::string cudaPrefix = getCUDAPrefix(astCtx);
+  mlir::Location loc = mlirModule->getLoc();
+  CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(mlirModule.getBody());
+
+  auto voidTy = builder.getVoidTy();
+  auto voidPtrTy = builder.getVoidPtrTy();
+  auto voidPtrPtrTy = builder.getPointerTo(voidPtrTy);
+  auto intTy = builder.getSIntNTy(32);
+  auto charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(),
+                                  /*isSigned=*/false);
+
+  // --- Create fatbin globals ---
+
+  // Create the fatbin string constant with GPU binary contents.
+  auto fatbinType =
+      ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
+  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+  GlobalOp fatbinStr =
+      GlobalOp::create(builder, loc, fatbinStrName, fatbinType,
+                       /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+  fatbinStr.setAlignment(8);
+  fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get(
+      fatbinType, builder.getStringAttr(gpuBinary->getBuffer())));
+  assert(!cir::MissingFeatures::opGlobalSection());
+  fatbinStr.setPrivate();
+
+  // Create the fatbin wrapper struct:
+  //    struct { int magic; int version; void *fatbin; void *unused; };
+  auto fatbinWrapperType = RecordType::get(
+      &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
+      /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
+  std::string fatbinWrapperName =
+      addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
+  GlobalOp fatbinWrapper =
+      GlobalOp::create(builder, loc, fatbinWrapperName, fatbinWrapperType,
+                       /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+
+  constexpr unsigned cudaFatMagic = 0x466243b1;
+  constexpr unsigned hipFatMagic = 0x48495046;
+  unsigned fatMagic = isHIP ? hipFatMagic : cudaFatMagic;
+
+  auto magicInit = IntAttr::get(intTy, fatMagic);
+  auto versionInit = IntAttr::get(intTy, 1);
+  auto fatbinStrSymbol =
+      mlir::FlatSymbolRefAttr::get(fatbinStr.getSymNameAttr());
+  auto fatbinInit = GlobalViewAttr::get(voidPtrTy, fatbinStrSymbol);
+  auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
+  fatbinWrapper.setInitialValueAttr(cir::ConstRecordAttr::get(
+      fatbinWrapperType,
+      mlir::ArrayAttr::get(&getContext(),
+                           {magicInit, versionInit, fatbinInit, unusedInit})));
+
+  // Create the GPU binary handle global variable.
+  std::string gpubinHandleName =
+      addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
+  GlobalOp gpuBinHandle = GlobalOp::create(
+      builder, loc, gpubinHandleName, voidPtrPtrTy,
+      /*isConstant=*/false, GlobalLinkageKind::InternalLinkage);
+  gpuBinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy));
+  gpuBinHandle.setPrivate();
+
+  // TODO: ctor/dtor/register_globals
+  assert(!cir::MissingFeatures::globalRegistration());
+}
+
 void LoweringPreparePass::runOnOperation() {
   mlir::Operation *op = getOperation();
   if (isa<::mlir::ModuleOp>(op))
@@ -1666,6 +1817,9 @@ void LoweringPreparePass::runOnOperation() {
     runOnOp(o);
 
   buildCXXGlobalInitFunc();
+  if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice)
+    buildCUDAModuleCtor();
+
   buildGlobalCtorDtorList();
 }
 
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu 
b/clang/test/CIR/CodeGenCUDA/device-stub.cu
new file mode 100644
index 0000000000000..59bfd5b31d522
--- /dev/null
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -0,0 +1,50 @@
+// Based on clang/test/CodeGenCUDA/device-stub.cu (incubator).
+
+// Create a dummy GPU binary file for registration.
+// RUN: echo -n "GPU binary would be here." > %t
+
+// CIR output — check fatbin globals are created correctly.
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
+// RUN:   -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+
+// OGCG output — check LLVM IR parity with original codegen.
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -x cuda \
+// RUN:   -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+// No GPU binary — nothing should be generated.
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
+// RUN:   -target-sdk-version=12.3 -o %t.nogpu.cir
+// RUN: FileCheck --input-file=%t.nogpu.cir %s --check-prefix=NOGPUBIN
+
+#include "Inputs/cuda.h"
+
+__global__ void kernelfunc(int i, int j, int k) {}
+
+void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
+
+// Check the fatbin string constant with GPU binary contents.
+// CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = 
#cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> 
{alignment = 8 : i64}
+
+// Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }.
+// CIR: cir.global constant cir_private @__cuda_fatbin_wrapper = 
#cir.const_record<{
+// CIR-SAME: #cir.int<1180844977> : !s32i,
+// CIR-SAME: #cir.int<1> : !s32i,
+// CIR-SAME: #cir.global_view<@__cuda_fatbin_str> : !cir.ptr<!void>,
+// CIR-SAME: #cir.ptr<null> : !cir.ptr<!void>
+// CIR-SAME: }>
+
+// Check the GPU binary handle global.
+// CIR: cir.global "private" internal @__cuda_gpubin_handle = #cir.ptr<null> : 
!cir.ptr<!cir.ptr<!void>>
+
+// OGCG: constant [25 x i8] c"GPU binary would be here.", section 
".nv_fatbin", align 8
+// OGCG: @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { 
i32 1180844977, i32 1, ptr @{{.*}}, ptr null }, section ".nvFatBinSegment"
+// OGCG: @__cuda_gpubin_handle = internal global ptr null
+
+// No GPU binary — no registration infrastructure at all.
+// NOGPUBIN-NOT: fatbin
+// NOGPUBIN-NOT: gpubin
+// NOGPUBIN-NOT: __cuda_register_globals
+// NOGPUBIN-NOT: __cuda_module_ctor
+// NOGPUBIN-NOT: __cuda_module_dtor

>From 74ff77f84666fb4f48644775c11feb23b090bae4 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Mon, 23 Mar 2026 20:19:09 -0400
Subject: [PATCH 02/11] fix tests and remove unnecessary comments.

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 -
 clang/test/CIR/CodeGenCUDA/device-stub.cu            | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index c4996efc4b50d..05574b9e30733 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -109,7 +109,6 @@ struct LoweringPreparePass
       cir::GlobalLinkageKind linkage = cir::GlobalLinkageKind::ExternalLinkage,
       cir::VisibilityKind visibility = cir::VisibilityKind::Default);
 
-
   /// ------------
   /// CUDA registration related
   /// ------------
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu 
b/clang/test/CIR/CodeGenCUDA/device-stub.cu
index 59bfd5b31d522..b3e8baa17c7a4 100644
--- a/clang/test/CIR/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -3,17 +3,14 @@
 // Create a dummy GPU binary file for registration.
 // RUN: echo -n "GPU binary would be here." > %t
 
-// CIR output — check fatbin globals are created correctly.
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
 // RUN:   -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.cir
 // RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
 
-// OGCG output — check LLVM IR parity with original codegen.
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -x cuda \
 // RUN:   -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.ll
 // RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
 
-// No GPU binary — nothing should be generated.
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
 // RUN:   -target-sdk-version=12.3 -o %t.nogpu.cir
 // RUN: FileCheck --input-file=%t.nogpu.cir %s --check-prefix=NOGPUBIN

>From 0c6b8530013334367876381a4f6614c409ab21ab Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Mon, 23 Mar 2026 20:46:17 -0400
Subject: [PATCH 03/11] fix global builder ordering

---
 .../CIR/Dialect/Transforms/LoweringPrepare.cpp    | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 05574b9e30733..5b241e2bcbad0 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1748,9 +1748,9 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
   std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
-  GlobalOp fatbinStr =
-      GlobalOp::create(builder, loc, fatbinStrName, fatbinType,
-                       /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+  GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, 
fatbinType,
+                                        /*isConstant=*/true, {},
+                                        GlobalLinkageKind::PrivateLinkage);
   fatbinStr.setAlignment(8);
   fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get(
       fatbinType, builder.getStringAttr(gpuBinary->getBuffer())));
@@ -1764,9 +1764,9 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
       /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
   std::string fatbinWrapperName =
       addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
-  GlobalOp fatbinWrapper =
-      GlobalOp::create(builder, loc, fatbinWrapperName, fatbinWrapperType,
-                       /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+  GlobalOp fatbinWrapper = GlobalOp::create(
+      builder, loc, fatbinWrapperName, fatbinWrapperType,
+      /*isConstant=*/true, {}, GlobalLinkageKind::PrivateLinkage);
 
   constexpr unsigned cudaFatMagic = 0x466243b1;
   constexpr unsigned hipFatMagic = 0x48495046;
@@ -1786,9 +1786,10 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   // Create the GPU binary handle global variable.
   std::string gpubinHandleName =
       addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
+
   GlobalOp gpuBinHandle = GlobalOp::create(
       builder, loc, gpubinHandleName, voidPtrPtrTy,
-      /*isConstant=*/false, GlobalLinkageKind::InternalLinkage);
+      /*isConstant=*/false, {}, cir::GlobalLinkageKind::InternalLinkage);
   gpuBinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy));
   gpuBinHandle.setPrivate();
 

>From 0fff8db0dcfa23a73bff71cedaf8f415466b6597 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Wed, 25 Mar 2026 23:14:58 -0400
Subject: [PATCH 04/11] Avoid copies from `std::string`

---
 .../clang/CIR/Dialect/IR/CIRCUDAAttrs.td      |  2 +-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 11 +++--
 .../Dialect/Transforms/LoweringPrepare.cpp    | 40 +++++++++++--------
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td 
b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
index a5374f4ffd79b..d9aabd602a279 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
@@ -64,7 +64,7 @@ def CIR_CUDABinaryHandleAttr : CIR_Attr<
     and then generate various registration functions.
   }];
 
-  let parameters = (ins "std::string":$name);
+  let parameters = (ins "mlir::StringAttr":$name);
   let assemblyFormat = "`<` $name `>`";
 }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp 
b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 4877781b89ad5..2d27aff2cc2c8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -32,10 +32,12 @@
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/Interfaces/CIROpInterfaces.h"
 #include "clang/CIR/MissingFeatures.h"
+#include "llvm/ADT/StringRef.h"
 
 #include "CIRGenFunctionInfo.h"
 #include "TargetInfo.h"
 #include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h"
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/MLIRContext.h"
@@ -165,11 +167,12 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
 
   // Set CUDA GPU binary handle.
   if (langOpts.CUDA) {
-    std::string cudaBinaryName = codeGenOpts.CudaGpuBinaryFileName;
+    llvm::StringRef cudaBinaryName = codeGenOpts.CudaGpuBinaryFileName;
     if (!cudaBinaryName.empty()) {
-      theModule->setAttr(
-          cir::CIRDialect::getCUDABinaryHandleAttrName(),
-          cir::CUDABinaryHandleAttr::get(&mlirContext, cudaBinaryName));
+      theModule->setAttr(cir::CIRDialect::getCUDABinaryHandleAttrName(),
+                         cir::CUDABinaryHandleAttr::get(
+                             &mlirContext, mlir::StringAttr::get(
+                                               &mlirContext, cudaBinaryName)));
     }
   }
 }
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 5b241e2bcbad0..a7162eb86b511 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -9,6 +9,7 @@
 #include "PassDetail.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attrs.inc"
 #include "clang/AST/Mangle.h"
@@ -21,9 +22,11 @@
 #include "clang/CIR/Dialect/IR/CIRDataLayout.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "clang/CIR/Dialect/IR/CIROpsEnums.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/Dialect/Passes.h"
 #include "clang/CIR/Interfaces/ASTAttrInterfaces.h"
 #include "clang/CIR/MissingFeatures.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Path.h"
 
@@ -1653,7 +1656,7 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) {
     else if (auto globalDtor = fnOp.getGlobalDtorPriority())
       globalDtorList.emplace_back(fnOp.getName(), globalDtor.value());
 
-    if (auto attr = fnOp->getAttr(cir::CUDAKernelNameAttr::getMnemonic())) {
+    if (mlir::Attribute attr = 
fnOp->getAttr(cir::CUDAKernelNameAttr::getMnemonic())) {
       auto kernelNameAttr = dyn_cast<CUDAKernelNameAttr>(attr);
       std::string kernelName = kernelNameAttr.getKernelName();
       cudaKernelMap[kernelName] = fnOp;
@@ -1663,15 +1666,15 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) {
   }
 }
 
-static std::string getCUDAPrefix(clang::ASTContext *astCtx) {
+static llvm::StringRef getCUDAPrefix(clang::ASTContext *astCtx) {
   if (astCtx->getLangOpts().HIP)
     return "hip";
   return "cuda";
 }
 
-static std::string addUnderscoredPrefix(llvm::StringRef prefix,
+static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix,
                                         llvm::StringRef name) {
-  return ("__" + prefix + name).str();
+  return ("__" + prefix + name).getSingleStringRef();
 }
 
 /// Creates a global constructor function for the module:
@@ -1716,8 +1719,10 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
     return;
   }
 
-  std::string cudaGPUBinaryName =
-      mlir::cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName();
+  llvm::StringRef cudaGPUBinaryName =
+      mlir::cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr)
+          .getName()
+          .getValue();
 
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> gpuBinaryOrErr =
       llvm::MemoryBuffer::getFile(cudaGPUBinaryName);
@@ -1730,24 +1735,25 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
       std::move(gpuBinaryOrErr.get());
 
   // Set up common types and builder.
-  std::string cudaPrefix = getCUDAPrefix(astCtx);
+  llvm::StringRef cudaPrefix = getCUDAPrefix(astCtx);
   mlir::Location loc = mlirModule->getLoc();
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointToStart(mlirModule.getBody());
 
-  auto voidTy = builder.getVoidTy();
-  auto voidPtrTy = builder.getVoidPtrTy();
-  auto voidPtrPtrTy = builder.getPointerTo(voidPtrTy);
-  auto intTy = builder.getSIntNTy(32);
-  auto charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(),
-                                  /*isSigned=*/false);
+  VoidType voidTy = builder.getVoidTy();
+  PointerType voidPtrTy = builder.getVoidPtrTy();
+  PointerType voidPtrPtrTy = builder.getPointerTo(voidPtrTy);
+  IntType intTy = builder.getSIntNTy(32);
+  IntType charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(),
+                                     /*isSigned=*/false);
 
   // --- Create fatbin globals ---
 
   // Create the fatbin string constant with GPU binary contents.
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
-  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+  llvm::StringRef fatbinStrName =
+      addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
   GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, 
fatbinType,
                                         /*isConstant=*/true, {},
                                         GlobalLinkageKind::PrivateLinkage);
@@ -1762,7 +1768,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinWrapperType = RecordType::get(
       &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
       /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
-  std::string fatbinWrapperName =
+  llvm::StringRef fatbinWrapperName =
       addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
   GlobalOp fatbinWrapper = GlobalOp::create(
       builder, loc, fatbinWrapperName, fatbinWrapperType,
@@ -1777,14 +1783,14 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinStrSymbol =
       mlir::FlatSymbolRefAttr::get(fatbinStr.getSymNameAttr());
   auto fatbinInit = GlobalViewAttr::get(voidPtrTy, fatbinStrSymbol);
-  auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
+  mlir::TypedAttr unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
   fatbinWrapper.setInitialValueAttr(cir::ConstRecordAttr::get(
       fatbinWrapperType,
       mlir::ArrayAttr::get(&getContext(),
                            {magicInit, versionInit, fatbinInit, unusedInit})));
 
   // Create the GPU binary handle global variable.
-  std::string gpubinHandleName =
+  llvm::StringRef gpubinHandleName =
       addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
 
   GlobalOp gpuBinHandle = GlobalOp::create(

>From 9930dd3e867cb32c1edb08cacc1db403e047130b Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 26 Mar 2026 01:09:34 -0400
Subject: [PATCH 05/11] address more string copies stuff yo

---
 clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td   | 2 +-
 clang/lib/CIR/CodeGen/CIRGenCall.cpp                 | 6 ++++--
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 7 ++++---
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td 
b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
index d9aabd602a279..8341819e84c62 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
@@ -32,7 +32,7 @@ def CIR_CUDAKernelNameAttr : CIR_Attr<"CUDAKernelName", 
"cu.kernel_name"> {
     respective function runtime registration on the host.
   }];
 
-  let parameters = (ins "std::string":$kernel_name);
+  let parameters = (ins "mlir::StringAttr":$kernel_name);
   let assemblyFormat = "`<` $kernel_name `>`";
   let canHaveIllegalCXXABIType = 0;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp 
b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 35479fa8097ce..800343f4336b6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -16,6 +16,7 @@
 #include "CIRGenFunction.h"
 #include "CIRGenFunctionInfo.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Attributes.h"
 #include "clang/CIR/ABIArgInfo.h"
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/TypeSize.h"
@@ -431,8 +432,9 @@ void CIRGenModule::constructAttributeList(
       GlobalDecl kernel(calleeInfo.getCalleeDecl());
       llvm::StringRef kernelName = getMangledName(
           kernel.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
-      auto attr =
-          cir::CUDAKernelNameAttr::get(&getMLIRContext(), kernelName.str());
+      auto attr = cir::CUDAKernelNameAttr::get(
+          &getMLIRContext(),
+          mlir::StringAttr::get(&getMLIRContext(), kernelName));
       attrs.set(attr.getMnemonic(), attr);
     }
 
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index a7162eb86b511..94b73915d7758 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1656,9 +1656,10 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) {
     else if (auto globalDtor = fnOp.getGlobalDtorPriority())
       globalDtorList.emplace_back(fnOp.getName(), globalDtor.value());
 
-    if (mlir::Attribute attr = 
fnOp->getAttr(cir::CUDAKernelNameAttr::getMnemonic())) {
+    if (mlir::Attribute attr =
+            fnOp->getAttr(cir::CUDAKernelNameAttr::getMnemonic())) {
       auto kernelNameAttr = dyn_cast<CUDAKernelNameAttr>(attr);
-      std::string kernelName = kernelNameAttr.getKernelName();
+      llvm::StringRef kernelName = kernelNameAttr.getKernelName();
       cudaKernelMap[kernelName] = fnOp;
     }
   } else if (auto threeWayCmp = dyn_cast<cir::CmpThreeWayOp>(op)) {
@@ -1714,7 +1715,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   mlir::Attribute cudaBinaryHandleAttr =
       mlirModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName());
   if (!cudaBinaryHandleAttr) {
-    if (astCtx->getLangOpts().HIP)
+    if (isHIP)
       assert(!cir::MissingFeatures::hipModuleCtor());
     return;
   }

>From 67c07153db68f381dbc88d851641d10431c292fd Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Fri, 27 Mar 2026 03:38:57 -0400
Subject: [PATCH 06/11] fix twine crashes

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 94b73915d7758..0fd10567146ec 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1673,9 +1673,9 @@ static llvm::StringRef getCUDAPrefix(clang::ASTContext 
*astCtx) {
   return "cuda";
 }
 
-static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix,
+static std::string addUnderscoredPrefix(llvm::StringRef prefix,
                                         llvm::StringRef name) {
-  return ("__" + prefix + name).getSingleStringRef();
+  return ("__" + prefix + name).str();
 }
 
 /// Creates a global constructor function for the module:
@@ -1753,8 +1753,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   // Create the fatbin string constant with GPU binary contents.
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
-  llvm::StringRef fatbinStrName =
-      addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
   GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, 
fatbinType,
                                         /*isConstant=*/true, {},
                                         GlobalLinkageKind::PrivateLinkage);
@@ -1769,7 +1768,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinWrapperType = RecordType::get(
       &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
       /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
-  llvm::StringRef fatbinWrapperName =
+  std::string fatbinWrapperName =
       addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
   GlobalOp fatbinWrapper = GlobalOp::create(
       builder, loc, fatbinWrapperName, fatbinWrapperType,
@@ -1791,7 +1790,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
                            {magicInit, versionInit, fatbinInit, unusedInit})));
 
   // Create the GPU binary handle global variable.
-  llvm::StringRef gpubinHandleName =
+  std::string gpubinHandleName =
       addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
 
   GlobalOp gpuBinHandle = GlobalOp::create(

>From 3b1cadf1980d50878eb0b9b10b4d311af53a5d1d Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 13:45:13 -0400
Subject: [PATCH 07/11] fix fmt

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 0fd10567146ec..7135683ecbb3b 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -8,8 +8,8 @@
 
 #include "PassDetail.h"
 #include "mlir/IR/Attributes.h"
-#include "mlir/IR/IRMapping.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
+#include "mlir/IR/IRMapping.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attrs.inc"
 #include "clang/AST/Mangle.h"

>From beb2302b6bdfe37e8ee9b37f991cdc3b654a1e3a Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 15:58:55 -0400
Subject: [PATCH 08/11] Fix conflicts and add section to fatbin globals

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 10 +++++++++-
 clang/test/CIR/CodeGenCUDA/device-stub.cu            |  7 +++----
 clang/test/CIR/CodeGenCUDA/kernel-call.cu            |  4 ++--
 clang/test/CIR/CodeGenCUDA/kernel-stub-name.cu       |  6 +++---
 clang/test/CIR/CodeGenHIP/simple.cpp                 |  2 +-
 5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 7135683ecbb3b..7ba81856f8eb2 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1750,6 +1750,13 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
 
   // --- Create fatbin globals ---
 
+  // The section names are different for MAC OS X.
+  llvm::StringRef fatbinConstName =
+      astCtx->getLangOpts().HIP ? ".hip_fatbin" : ".nv_fatbin";
+
+  llvm::StringRef fatbinSectionName =
+      astCtx->getLangOpts().HIP ? ".hipFatBinSegment" : ".nvFatBinSegment";
+
   // Create the fatbin string constant with GPU binary contents.
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
@@ -1760,7 +1767,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   fatbinStr.setAlignment(8);
   fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get(
       fatbinType, builder.getStringAttr(gpuBinary->getBuffer())));
-  assert(!cir::MissingFeatures::opGlobalSection());
+  fatbinStr.setSection(fatbinConstName);
   fatbinStr.setPrivate();
 
   // Create the fatbin wrapper struct:
@@ -1773,6 +1780,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   GlobalOp fatbinWrapper = GlobalOp::create(
       builder, loc, fatbinWrapperName, fatbinWrapperType,
       /*isConstant=*/true, {}, GlobalLinkageKind::PrivateLinkage);
+  fatbinWrapper.setSection(fatbinSectionName);
 
   constexpr unsigned cudaFatMagic = 0x466243b1;
   constexpr unsigned hipFatMagic = 0x48495046;
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu 
b/clang/test/CIR/CodeGenCUDA/device-stub.cu
index b3e8baa17c7a4..2e9deaee9b225 100644
--- a/clang/test/CIR/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -21,16 +21,15 @@ __global__ void kernelfunc(int i, int j, int k) {}
 
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 
-// Check the fatbin string constant with GPU binary contents.
-// CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = 
#cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> 
{alignment = 8 : i64}
+// CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = 
#cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> 
{alignment = 8 : i64, section = ".nv_fatbin"}
 
-// Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }.
+// Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }, 
with section.
 // CIR: cir.global constant cir_private @__cuda_fatbin_wrapper = 
#cir.const_record<{
 // CIR-SAME: #cir.int<1180844977> : !s32i,
 // CIR-SAME: #cir.int<1> : !s32i,
 // CIR-SAME: #cir.global_view<@__cuda_fatbin_str> : !cir.ptr<!void>,
 // CIR-SAME: #cir.ptr<null> : !cir.ptr<!void>
-// CIR-SAME: }>
+// CIR-SAME: }> : !rec_anon_struct {section = ".nvFatBinSegment"}
 
 // Check the GPU binary handle global.
 // CIR: cir.global "private" internal @__cuda_gpubin_handle = #cir.ptr<null> : 
!cir.ptr<!cir.ptr<!void>>
diff --git a/clang/test/CIR/CodeGenCUDA/kernel-call.cu 
b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
index 2d37b6eef73af..3d153f581cec7 100644
--- a/clang/test/CIR/CodeGenCUDA/kernel-call.cu
+++ b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
@@ -106,13 +106,13 @@ int main(void) {
   // CUDA-NEW: } else {
   // CUDA-NEW:   cir.const #cir.int<42> : !s32i
   // CUDA-NEW:   cir.const #cir.fp<1.000000e+00> : !cir.float
-  // CUDA-NEW:   cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name 
= #cir.cu.kernel_name<_Z6kernelif>} : (!s32i {llvm.noundef}, !cir.float 
{llvm.noundef}) -> ()
+  // CUDA-NEW:   cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name 
= #cir.cu.kernel_name<"_Z6kernelif">} : (!s32i {llvm.noundef}, !cir.float 
{llvm.noundef}) -> ()
   // CUDA-NEW: }
   // HIP-NEW: cir.if %{{.*}} {
   // HIP-NEW: } else {
   // HIP-NEW:   cir.const #cir.int<42> : !s32i
   // HIP-NEW:   cir.const #cir.fp<1.000000e+00> : !cir.float
-  // HIP-NEW:   cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name 
= #cir.cu.kernel_name<_Z6kernelif>} : (!s32i {llvm.noundef}, !cir.float 
{llvm.noundef}) -> ()
+  // HIP-NEW:   cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name 
= #cir.cu.kernel_name<"_Z6kernelif">} : (!s32i {llvm.noundef}, !cir.float 
{llvm.noundef}) -> ()
   // HIP-NEW: }
   kernel<<<1, 1>>>(42, 1.0f);
 }
diff --git a/clang/test/CIR/CodeGenCUDA/kernel-stub-name.cu 
b/clang/test/CIR/CodeGenCUDA/kernel-stub-name.cu
index 1a874d9e9fada..42c8f10430b1f 100644
--- a/clang/test/CIR/CodeGenCUDA/kernel-stub-name.cu
+++ b/clang/test/CIR/CodeGenCUDA/kernel-stub-name.cu
@@ -6,17 +6,17 @@
 
 #include "Inputs/cuda.h"
 
-// CHECK: cir.func {{.*}} @[[CSTUB:__device_stub__ckernel]]() attributes 
{cu.kernel_name = #cir.cu.kernel_name<ckernel>{{.*}}}
+// CHECK: cir.func {{.*}} @[[CSTUB:__device_stub__ckernel]]() attributes 
{cu.kernel_name = #cir.cu.kernel_name<"ckernel">{{.*}}}
 // CHECK: cir.return
 // CHECK-NEXT: }
 extern "C" __global__ void ckernel() {}
 
-// CHECK: cir.func {{.*}} @_ZN2ns23__device_stub__nskernelEv() attributes 
{cu.kernel_name = #cir.cu.kernel_name<_ZN2ns8nskernelEv>{{.*}}}
+// CHECK: cir.func {{.*}} @_ZN2ns23__device_stub__nskernelEv() attributes 
{cu.kernel_name = #cir.cu.kernel_name<"_ZN2ns8nskernelEv">{{.*}}}
 namespace ns {
 __global__ void nskernel() {}
 } // namespace ns
 
-// CHECK: cir.func {{.*}} @_Z25__device_stub__kernelfuncIiEvv() attributes 
{cu.kernel_name = #cir.cu.kernel_name<_Z10kernelfuncIiEvv>{{.*}}}
+// CHECK: cir.func {{.*}} @_Z25__device_stub__kernelfuncIiEvv() attributes 
{cu.kernel_name = #cir.cu.kernel_name<"_Z10kernelfuncIiEvv">{{.*}}}
 template <class T>
 __global__ void kernelfunc() {}
 template __global__ void kernelfunc<int>();
diff --git a/clang/test/CIR/CodeGenHIP/simple.cpp 
b/clang/test/CIR/CodeGenHIP/simple.cpp
index 15240fd7a3038..b3df34aed6afb 100644
--- a/clang/test/CIR/CodeGenHIP/simple.cpp
+++ b/clang/test/CIR/CodeGenHIP/simple.cpp
@@ -42,7 +42,7 @@ __global__ void global_fn(int a) {}
 // CIR-DEVICE: cir.func {{.*}}{{.*}} @_Z9global_fni
 // OGCG-DEVICE: define protected amdgpu_kernel void @_Z9global_fni
 
-// CIR-HOST: @_Z24__device_stub__global_fni{{.*}}attributes {cu.kernel_name = 
#cir.cu.kernel_name<_Z9global_fni>{{.*}}}
+// CIR-HOST: @_Z24__device_stub__global_fni{{.*}}attributes {cu.kernel_name = 
#cir.cu.kernel_name<"_Z9global_fni">{{.*}}}
 // CIR-HOST: %[[#CIRKernelArgs:]] = cir.alloca {{.*}}"kernel_args"
 // CIR-HOST: %[[#Decayed:]] = cir.cast array_to_ptrdecay %[[#CIRKernelArgs]]
 // CIR-HOST: cir.call @__hipPopCallConfiguration

>From 9ede868137d722f87b0903e37fe2967840b81ead Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 17:19:29 -0400
Subject: [PATCH 09/11] remove accidental .inc include

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 7ba81856f8eb2..8943a63b60536 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -11,7 +11,6 @@
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/IRMapping.h"
 #include "clang/AST/ASTContext.h"
-#include "clang/AST/Attrs.inc"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/Specifiers.h"

>From 08764284a1c22bd0a7f83abbc03d1d7ffd74d00d Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 17:31:27 -0400
Subject: [PATCH 10/11] Fix missing include for memoryBuffer on linux ci

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 8943a63b60536..b5140c281ed2d 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -27,6 +27,7 @@
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 
 #include <memory>

>From f41fc9f0f01be7eca3000ec69b406002c0fddfe7 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 17:47:02 -0400
Subject: [PATCH 11/11] remove unused var

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index b5140c281ed2d..8185385f92b50 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1741,7 +1741,6 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointToStart(mlirModule.getBody());
 
-  VoidType voidTy = builder.getVoidTy();
   PointerType voidPtrTy = builder.getVoidPtrTy();
   PointerType voidPtrPtrTy = builder.getPointerTo(voidPtrTy);
   IntType intTy = builder.getSIntNTy(32);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to