[clang] [llvm] Boxu.zhang.llvm (PR #140671)

via cfe-commits Mon, 19 May 2025 21:05:00 -0700

llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: boxu.zhang (boxu-zhang)

<details>
<summary>Changes</summary>

Last week, someone asking me to implement printf function on OpenCL with NVPTX 
which is strange for me, because I remember that the libdevice of CUDA has 
already implemented it and it should also work for OpenCL. However, it's not 
supported. Testing it with this simple case, the IR showed the kernel calls 
exact the printf function itself which is declared but not defined.
`
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 
2)));

__kernel void test_printf() {
  printf("hello, printf on nvptx.");
}
`

And I did some research, found an obvious bug in 
"Sema::deduceOpenCLAddressSpace". In this function, the first 'if' statement 
checks whether the input 'Decl' type has address space, if it has, then just 
return. This logic is not correct for type of the 'format' string argument of 
'printf' function. The '__constant const char * st' is the argument declaration 
which type is '__constant const char *' and has address space 
already(__constant, LangAS::opencl_constant). Call 'hasAddressSpace()' returns 
false with this type, which is obvious a bug. 
The reason is simple, it should check whether the 'Decl' type is a PointerType 
first, and then check whether the PointeeType has address space instead. The 
address space is only reasonable for PointeeType. 
See the commits for detail.

Also I made some other changes to support implementing builtin printf with 
vprintf on OpenCL when the target is NVPTX.

---
Full diff: https://github.com/llvm/llvm-project/pull/140671.diff


7 Files Affected:

- (modified) clang/lib/AST/ASTContext.cpp (+5) 
- (modified) clang/lib/AST/Decl.cpp (+3-1) 
- (modified) clang/lib/CodeGen/CGGPUBuiltin.cpp (+9-2) 
- (modified) clang/lib/Sema/SemaDecl.cpp (+6-2) 
- (added) clang/test/CodeGenOpenCL/test-printf-nvptx.cl (+14) 
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+4) 
- (modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+6-3) 


``````````diff
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 0395b3e47ab6f..8e3e6d3331b63 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -11481,6 +11481,11 @@ static QualType DecodeTypeFromStr(const char *&Str, 
const ASTContext &Context,
     // FIXME: There's no way to have a built-in with an rvalue ref arg.
     case 'C':
       Type = Type.withConst();
+      // adjust 'const char *' to 'const char __constant *' on OpenCL
+      if (Context.getLangOpts().OpenCL &&
+          Type.getTypePtr() == Context.CharTy.getTypePtr()) {
+        Type = Context.getAddrSpaceQualType(Type, LangAS::opencl_constant);
+      }
       break;
     case 'D':
       Type = Context.getVolatileType(Type);
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index c2ea155679193..a0b334e4323c8 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3600,8 +3600,10 @@ unsigned FunctionDecl::getBuiltinID(bool 
ConsiderWrapperFunctions) const {
 
   // OpenCL v1.2 s6.9.f - The library functions defined in
   // the C99 standard headers are not available.
+  // EXCEPTION: printf is supported for AMDGPU
   if (Context.getLangOpts().OpenCL &&
-      Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
+      Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID) &&
+      BuiltinID != Builtin::BIprintf)
     return 0;
 
   // CUDA does not have device-side standard library. printf and malloc are the
diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp 
b/clang/lib/CodeGen/CGGPUBuiltin.cpp
index e465789a003eb..aa7fa5426bff7 100644
--- a/clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -135,6 +135,7 @@ bool containsNonScalarVarargs(CodeGenFunction *CGF, const 
CallArgList &Args) {
 RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF,
                                 llvm::Function *Decl, bool WithSizeArg) {
   CodeGenModule &CGM = CGF->CGM;
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
   CGBuilderTy &Builder = CGF->Builder;
   assert(E->getBuiltinCallee() == Builtin::BIprintf);
   assert(E->getNumArgs() >= 1); // printf always has at least one arg.
@@ -155,9 +156,15 @@ RValue EmitDevicePrintfCallExpr(const CallExpr *E, 
CodeGenFunction *CGF,
 
   auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args);
   llvm::Value *BufferPtr = r.first;
+  llvm::Value *Fmt = Args[0].getRValue(*CGF).getScalarVal();
 
-  llvm::SmallVector<llvm::Value *, 3> Vec = {
-      Args[0].getRValue(*CGF).getScalarVal(), BufferPtr};
+  // For OpenCL, the default addrspace of 'format' argument is 
LangAS::opencl_constant,
+  // however, the 'vprintf' requires it to be unqualified 'ptr' type. Do 
pointer cast if
+  // it's the case.
+  if (CGM.getContext().getLangOpts().OpenCL)
+    Fmt = Builder.CreatePointerCast(Fmt, llvm::PointerType::getUnqual(Ctx));
+
+  llvm::SmallVector<llvm::Value *, 3> Vec = {Fmt, BufferPtr};
   if (WithSizeArg) {
     // Passing > 32bit of data as a local alloca doesn't work for nvptx or
     // amdgpu
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index ffbe317d55999..6792c768977c4 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -7008,8 +7008,12 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) {
 }
 
 void Sema::deduceOpenCLAddressSpace(ValueDecl *Decl) {
-  if (Decl->getType().hasAddressSpace())
-    return;
+  // Address space is only meaningful for pointer type
+  if (Decl->getType()->isPointerType()) {
+    const PointerType *T = dyn_cast<PointerType>(Decl->getType().getTypePtr());
+    if (T->getPointeeType().hasAddressSpace())
+      return;
+  }
   if (Decl->getType()->isDependentType())
     return;
   if (VarDecl *Var = dyn_cast<VarDecl>(Decl)) {
diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl 
b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl
new file mode 100644
index 0000000000000..4a674c396c23c
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl
@@ -0,0 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 4
+// RUN: %clang_cc1 -cl-std=CL3.0 -triple nvptx-- -disable-llvm-passes 
-emit-llvm -o - %s | FileCheck -check-prefixes=NV %s
+
+int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 
2)));
+
+// NV-LABEL: define dso_local spir_kernel void @test_printf(
+// NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 
!kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 
!kernel_arg_type_qual !4 {
+// NV-NEXT:  entry:
+// NV-NEXT:    [[TMP0:%.*]] = call i32 @vprintf(ptr addrspacecast (ptr 
addrspace(4) @.str to ptr), ptr null)
+// NV-NEXT:    ret void
+//
+__kernel void test_printf() {
+  printf("hello, printf on nvptx.");
+}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f5114fa40c70a..735be3680aea0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -560,6 +560,10 @@ class TargetTransformInfo {
     // (set to UINT_MAX to disable). This does not apply in cases where the
     // loop is being fully unrolled.
     unsigned MaxCount;
+    /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound
+    /// to be overrided by a target gives more flexiblity on certain cases.
+    /// By default, MaxUpperBound uses UnrollMaxUpperBound which value is 8.
+    unsigned MaxUpperBound;
     /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
     /// applies even if full unrolling is selected. This allows a target to 
fall
     /// back to Partial unrolling if full unrolling is above 
FullUnrollMaxCount.
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp 
b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f14541a1a037e..7cfeb019af972 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -200,6 +200,7 @@ TargetTransformInfo::UnrollingPreferences 
llvm::gatherUnrollingPreferences(
   UP.Count = 0;
   UP.DefaultUnrollRuntimeCount = 8;
   UP.MaxCount = std::numeric_limits<unsigned>::max();
+  UP.MaxUpperBound = UnrollMaxUpperBound;
   UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
   UP.BEInsns = 2;
   UP.Partial = false;
@@ -237,6 +238,8 @@ TargetTransformInfo::UnrollingPreferences 
llvm::gatherUnrollingPreferences(
     UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
   if (UnrollMaxCount.getNumOccurrences() > 0)
     UP.MaxCount = UnrollMaxCount;
+  if (UnrollMaxUpperBound.getNumOccurrences() > 0)
+    UP.MaxUpperBound = UnrollMaxUpperBound;
   if (UnrollFullMaxCount.getNumOccurrences() > 0)
     UP.FullUnrollMaxCount = UnrollFullMaxCount;
   if (UnrollAllowPartial.getNumOccurrences() > 0)
@@ -777,7 +780,7 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
     return TripCount;
 
   if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
-      MaxTripCount <= UnrollMaxUpperBound)
+      MaxTripCount <= UP.MaxUpperBound)
     return MaxTripCount;
 
   // if didn't return until here, should continue to other priorties
@@ -952,7 +955,7 @@ bool llvm::computeUnrollCount(
   // cost of exact full unrolling.  As such, if we have an exact count and
   // found it unprofitable, we'll never chose to bounded unroll.
   if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
-      MaxTripCount <= UnrollMaxUpperBound) {
+      MaxTripCount <= UP.MaxUpperBound) {
     UP.Count = MaxTripCount;
     if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                              MaxTripCount, UCE, UP)) {
@@ -1026,7 +1029,7 @@ bool llvm::computeUnrollCount(
   }
 
   // Don't unroll a small upper bound loop unless user or TTI asked to do so.
-  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+  if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
     UP.Count = 0;
     return false;
   }

``````````

</details>


https://github.com/llvm/llvm-project/pull/140671
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] Boxu.zhang.llvm (PR #140671)

Reply via email to