[clang] a35b594 - [CIR][CUDA][HIP] Add NVPTX target info and CUDA/HIP global emission filtering (#177827)

via cfe-commits Tue, 03 Feb 2026 03:18:52 -0800

Author: David Rivera
Date: 2026-02-03T06:18:39-05:00
New Revision: a35b5940b44e4e3b9b1be98ca1b3e0b2743943ff


URL: 
https://github.com/llvm/llvm-project/commit/a35b5940b44e4e3b9b1be98ca1b3e0b2743943ff
DIFF: 
https://github.com/llvm/llvm-project/commit/a35b5940b44e4e3b9b1be98ca1b3e0b2743943ff.diff

LOG: [CIR][CUDA][HIP] Add NVPTX target info and CUDA/HIP global emission 
filtering (#177827)

related: #175871 

This patch adds foundational infra for device-side CUDA/HIP compilation
by introducing NVPTX target info and implementing the global emission
filtering logic.


  NVPTX Target Info to allows us to compile against that triple:
  - Add NVPTXABIInfo and NVPTXTargetCIRGenInfo classes
  - Wire up nvptx and nvptx64 triples in getTargetCIRGenInfo()
  - Add createNVPTXTargetCIRGenInfo() factory function

CUDA/HIP Global Emission Filtering (most of this is boilerplate from the
AST) This basically narrows down to:
- Skip host-only functions (no `__device__` attribute) when
`-fcuda-is-device`
   - Skip device-only functions (device without  host) on host side
- Always emit ` __global__` kernels and `__host__` `__device__`
functions on both sides
- Add `shouldEmitCUDAGlobalVar()` to handle variable emission
(device/constant/shared variables)
- Handle special cases: implicit host/device templates, lambda call
operators

Added: 
    clang/test/CIR/CodeGenCUDA/filter-decl.cu
    clang/test/CIR/CodeGenCUDA/nvptx-basic.cu

Modified: 
    clang/lib/CIR/CodeGen/CIRGenModule.cpp
    clang/lib/CIR/CodeGen/CIRGenModule.h
    clang/lib/CIR/CodeGen/TargetInfo.cpp
    clang/lib/CIR/CodeGen/TargetInfo.h

Removed: 
    


################################################################################
diff  --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp 
b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index fd99a47b8b445..508e941517c3d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -17,6 +17,7 @@
 #include "CIRGenFunction.h"
 
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTLambda.h"
 #include "clang/AST/DeclBase.h"
 #include "clang/AST/DeclOpenACC.h"
 #include "clang/AST/GlobalDecl.h"
@@ -29,6 +30,7 @@
 #include "clang/CIR/MissingFeatures.h"
 
 #include "CIRGenFunctionInfo.h"
+#include "TargetInfo.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/MLIRContext.h"
@@ -252,6 +254,10 @@ const TargetCIRGenInfo 
&CIRGenModule::getTargetCIRGenInfo() {
       return *theTargetCIRGenInfo;
     }
   }
+  case llvm::Triple::nvptx:
+  case llvm::Triple::nvptx64:
+    theTargetCIRGenInfo = createNVPTXTargetCIRGenInfo(genTypes);
+    return *theTargetCIRGenInfo;
   }
 }
 
@@ -368,6 +374,28 @@ void CIRGenModule::emitDeferred() {
   }
 }
 
+template <typename AttrT> static bool hasImplicitAttr(const ValueDecl *decl) {
+  if (!decl)
+    return false;
+  if (auto *attr = decl->getAttr<AttrT>())
+    return attr->isImplicit();
+  return decl->isImplicit();
+}
+
+// TODO(cir): This should be shared with OG Codegen.
+bool CIRGenModule::shouldEmitCUDAGlobalVar(const VarDecl *global) const {
+  assert(langOpts.CUDA && "Should not be called by non-CUDA languages");
+  // We need to emit host-side 'shadows' for all global
+  // device-side variables because the CUDA runtime needs their
+  // size and host-side address in order to provide access to
+  // their device-side incarnations.
+  return !langOpts.CUDAIsDevice || global->hasAttr<CUDADeviceAttr>() ||
+         global->hasAttr<CUDAConstantAttr>() ||
+         global->hasAttr<CUDASharedAttr>() ||
+         global->getType()->isCUDADeviceBuiltinSurfaceType() ||
+         global->getType()->isCUDADeviceBuiltinTextureType();
+}
+
 void CIRGenModule::emitGlobal(clang::GlobalDecl gd) {
   if (const auto *cd = dyn_cast<clang::OpenACCConstructDecl>(gd.getDecl())) {
     emitGlobalOpenACCDecl(cd);
@@ -382,6 +410,36 @@ void CIRGenModule::emitGlobal(clang::GlobalDecl gd) {
 
   const auto *global = cast<ValueDecl>(gd.getDecl());
 
+  // If this is CUDA, be selective about which declarations we emit.
+  // Non-constexpr non-lambda implicit host device functions are not emitted
+  // unless they are used on device side.
+  if (langOpts.CUDA) {
+    assert((isa<FunctionDecl>(global) || isa<VarDecl>(global)) &&
+           "Expected Variable or Function");
+    if (const auto *varDecl = dyn_cast<VarDecl>(global)) {
+      if (!shouldEmitCUDAGlobalVar(varDecl))
+        return;
+      // TODO(cir): This should be shared with OG Codegen.
+    } else if (langOpts.CUDAIsDevice) {
+      const auto *functionDecl = dyn_cast<FunctionDecl>(global);
+      if ((!global->hasAttr<CUDADeviceAttr>() ||
+           (langOpts.OffloadImplicitHostDeviceTemplates &&
+            hasImplicitAttr<CUDAHostAttr>(functionDecl) &&
+            hasImplicitAttr<CUDADeviceAttr>(functionDecl) &&
+            !functionDecl->isConstexpr() &&
+            !isLambdaCallOperator(functionDecl) &&
+            !getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.count(
+                functionDecl))) &&
+          !global->hasAttr<CUDAGlobalAttr>() &&
+          !(langOpts.HIPStdPar && isa<FunctionDecl>(global) &&
+            !global->hasAttr<CUDAHostAttr>()))
+        return;
+      // Device-only functions are the only things we skip.
+    } else if (!global->hasAttr<CUDAHostAttr>() &&
+               global->hasAttr<CUDADeviceAttr>())
+      return;
+  }
+
   if (const auto *fd = dyn_cast<FunctionDecl>(global)) {
     // Update deferred annotations with the latest declaration if the function
     // was already used or defined.

diff  --git a/clang/lib/CIR/CodeGen/CIRGenModule.h 
b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 46ef0ad426ae1..88b66129a8348 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -567,6 +567,10 @@ class CIRGenModule : public CIRGenTypeCache {
 
   static void setInitializer(cir::GlobalOp &op, mlir::Attribute value);
 
+  // Whether a global variable should be emitted by CUDA/HIP host/device
+  // related attributes.
+  bool shouldEmitCUDAGlobalVar(const VarDecl *global) const;
+
   void replaceUsesOfNonProtoTypeWithRealFunction(mlir::Operation *old,
                                                  cir::FuncOp newFn);
 

diff  --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp 
b/clang/lib/CIR/CodeGen/TargetInfo.cpp
index 377c532e492d9..dc29dc0204c19 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp
@@ -56,6 +56,25 @@ class X8664TargetCIRGenInfo : public TargetCIRGenInfo {
 
 } // namespace
 
+namespace {
+
+class NVPTXABIInfo : public ABIInfo {
+public:
+  NVPTXABIInfo(CIRGenTypes &cgt) : ABIInfo(cgt) {}
+};
+
+class NVPTXTargetCIRGenInfo : public TargetCIRGenInfo {
+public:
+  NVPTXTargetCIRGenInfo(CIRGenTypes &cgt)
+      : TargetCIRGenInfo(std::make_unique<NVPTXABIInfo>(cgt)) {}
+};
+} // namespace
+
+std::unique_ptr<TargetCIRGenInfo>
+clang::CIRGen::createNVPTXTargetCIRGenInfo(CIRGenTypes &cgt) {
+  return std::make_unique<NVPTXTargetCIRGenInfo>(cgt);
+}
+
 std::unique_ptr<TargetCIRGenInfo>
 clang::CIRGen::createX8664TargetCIRGenInfo(CIRGenTypes &cgt) {
   return std::make_unique<X8664TargetCIRGenInfo>(cgt);

diff  --git a/clang/lib/CIR/CodeGen/TargetInfo.h 
b/clang/lib/CIR/CodeGen/TargetInfo.h
index 9535ba94fb08b..bab838692e215 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.h
+++ b/clang/lib/CIR/CodeGen/TargetInfo.h
@@ -124,6 +124,8 @@ class TargetCIRGenInfo {
 
 std::unique_ptr<TargetCIRGenInfo> createX8664TargetCIRGenInfo(CIRGenTypes 
&cgt);
 
+std::unique_ptr<TargetCIRGenInfo> createNVPTXTargetCIRGenInfo(CIRGenTypes 
&cgt);
+
 } // namespace clang::CIRGen
 
 #endif // LLVM_CLANG_LIB_CIR_TARGETINFO_H

diff  --git a/clang/test/CIR/CodeGenCUDA/filter-decl.cu 
b/clang/test/CIR/CodeGenCUDA/filter-decl.cu
new file mode 100644
index 0000000000000..a2a7e9ba9a1b0
--- /dev/null
+++ b/clang/test/CIR/CodeGenCUDA/filter-decl.cu
@@ -0,0 +1,55 @@
+// Based on clang/test/CodeGenCUDA/filter-decl.cu tailored for CIR current 
capabilities.
+// Tests that host/device functions are emitted only on the appropriate side.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN:   -x cuda -emit-cir %s -o %t.host.cir
+// RUN: FileCheck --input-file=%t.host.cir %s --check-prefix=CIR-HOST
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -x cuda \
+// RUN:  -fcuda-is-device -emit-cir %s -o %t.device.cir
+// RUN: FileCheck --input-file=%t.device.cir %s --check-prefix=CIR-DEVICE
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN:   -x cuda -emit-llvm %s -o %t.host.ll
+// RUN: FileCheck --input-file=%t.host.ll %s --check-prefix=OGCG-HOST
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -x cuda \
+// RUN:   -fcuda-is-device -emit-llvm %s -o %t.device.ll
+// RUN: FileCheck --input-file=%t.device.ll %s --check-prefix=OGCG-DEVICE
+
+#include "Inputs/cuda.h"
+
+// Implicit host function (no attribute) — host only
+// CIR-HOST: cir.func {{.*}} @_Z20implicithostonlyfuncv()
+// CIR-DEVICE-NOT: @_Z20implicithostonlyfuncv
+// OGCG-HOST: define{{.*}} void @_Z20implicithostonlyfuncv()
+// OGCG-DEVICE-NOT: @_Z20implicithostonlyfuncv
+void implicithostonlyfunc(void) {}
+
+// Explicit __host__ function — host only
+// CIR-HOST: cir.func {{.*}} @_Z20explicithostonlyfuncv()
+// CIR-DEVICE-NOT: @_Z20explicithostonlyfuncv
+// OGCG-HOST: define{{.*}} void @_Z20explicithostonlyfuncv()
+// OGCG-DEVICE-NOT: @_Z20explicithostonlyfuncv
+__host__ void explicithostonlyfunc(void) {}
+
+// __device__ function — device only
+// CIR-HOST-NOT: @_Z14deviceonlyfuncv
+// CIR-DEVICE: cir.func {{.*}} @_Z14deviceonlyfuncv()
+// OGCG-HOST-NOT: @_Z14deviceonlyfuncv
+// OGCG-DEVICE: define{{.*}} void @_Z14deviceonlyfuncv()
+__device__ void deviceonlyfunc(void) {}
+
+// __host__ __device__ function — both sides
+// CIR-HOST: cir.func {{.*}} @_Z14hostdevicefuncv()
+// CIR-DEVICE: cir.func {{.*}} @_Z14hostdevicefuncv()
+// OGCG-HOST: define{{.*}} void @_Z14hostdevicefuncv()
+// OGCG-DEVICE: define{{.*}} void @_Z14hostdevicefuncv()
+__host__ __device__ void hostdevicefunc(void) {}
+
+// __global__ kernel — both sides (stub on host, kernel on device)
+// CIR-HOST: cir.func {{.*}} @_Z25__device_stub__globalfuncv()
+// CIR-DEVICE: cir.func {{.*}} @_Z10globalfuncv()
+// OGCG-HOST: define{{.*}} void @_Z25__device_stub__globalfuncv()
+// OGCG-DEVICE: define{{.*}} void @_Z10globalfuncv()
+__global__ void globalfunc(void) {}

diff  --git a/clang/test/CIR/CodeGenCUDA/nvptx-basic.cu 
b/clang/test/CIR/CodeGenCUDA/nvptx-basic.cu
new file mode 100644
index 0000000000000..99f0164a18506
--- /dev/null
+++ b/clang/test/CIR/CodeGenCUDA/nvptx-basic.cu
@@ -0,0 +1,30 @@
+// Based on clang/test/CodeGenCUDA/ptx-kernels.cu tailored for CIR current 
capabilities.
+// Tests basic device-side compilation with NVPTX target.
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -x cuda \
+// RUN:   -fcuda-is-device -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: cir.func {{.*}} @device_function()
+extern "C"
+__device__ void device_function() {}
+
+// CHECK: cir.func {{.*}} @global_function()
+// CHECK:   cir.call @device_function()
+extern "C"
+__global__ void global_function() {
+  device_function();
+}
+
+// Template kernel with explicit instantiation
+template <typename T> __global__ void templated_kernel(T param) {}
+template __global__ void templated_kernel<int>(int);
+// CHECK: cir.func {{.*}} @_Z16templated_kernelIiEvT_
+
+// Anonymous namespace kernel
+namespace {
+__global__ void anonymous_ns_kernel() {}
+// CHECK: cir.func {{.*}} @_ZN12_GLOBAL__N_119anonymous_ns_kernelEv
+}


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] a35b594 - [CIR][CUDA][HIP] Add NVPTX target info and CUDA/HIP global emission filtering (#177827)

Reply via email to