https://github.com/boxu-zhang created https://github.com/llvm/llvm-project/pull/140671
Last week, someone asking me to implement printf function on OpenCL with NVPTX which is strange for me, because I remember that the libdevice of CUDA has already implemented it and it should also work for OpenCL. However, it's not supported. Testing it with this simple case, the IR showed the kernel calls exact the printf function itself which is declared but not defined. ` int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); __kernel void test_printf() { printf("hello, printf on nvptx."); } ` And I did some research, found an obvious bug in "Sema::deduceOpenCLAddressSpace". In this function, the first 'if' statement checks whether the input 'Decl' type has address space, if it has, then just return. This logic is not correct for type of the 'format' string argument of 'printf' function. The '__constant const char * st' is the argument declaration which type is '__constant const char *' and has address space already(__constant, LangAS::opencl_constant). Call 'hasAddressSpace()' returns false with this type, which is obvious a bug. The reason is simple, it should check whether the 'Decl' type is a PointerType first, and then check whether the PointeeType has address space instead. The address space is only reasonable for PointeeType. See the commits for detail. Also I made some other changes to support implementing builtin printf with vprintf on OpenCL when the target is NVPTX. >From a3e883a4dc55029b8a7e3c591c3bc85289869686 Mon Sep 17 00:00:00 2001 From: "boxu.zhang" <boxu.zh...@hotmail.com> Date: Wed, 20 Dec 2023 17:35:25 +0800 Subject: [PATCH 1/3] Make 'UnrollMaxUpperBound' to be overridable by target. The default value is still 8 and the command line argument '--unroll-max-upperbound' takes final effect if provided. --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 4 ++++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f5114fa40c70a..735be3680aea0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -560,6 +560,10 @@ class TargetTransformInfo { // (set to UINT_MAX to disable). This does not apply in cases where the // loop is being fully unrolled. unsigned MaxCount; + /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound + /// to be overrided by a target gives more flexiblity on certain cases. + /// By default, MaxUpperBound uses UnrollMaxUpperBound which value is 8. + unsigned MaxUpperBound; /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but /// applies even if full unrolling is selected. This allows a target to fall /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount. diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index f14541a1a037e..7cfeb019af972 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -200,6 +200,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.Count = 0; UP.DefaultUnrollRuntimeCount = 8; UP.MaxCount = std::numeric_limits<unsigned>::max(); + UP.MaxUpperBound = UnrollMaxUpperBound; UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max(); UP.BEInsns = 2; UP.Partial = false; @@ -237,6 +238,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost; if (UnrollMaxCount.getNumOccurrences() > 0) UP.MaxCount = UnrollMaxCount; + if (UnrollMaxUpperBound.getNumOccurrences() > 0) + UP.MaxUpperBound = UnrollMaxUpperBound; if (UnrollFullMaxCount.getNumOccurrences() > 0) UP.FullUnrollMaxCount = UnrollFullMaxCount; if (UnrollAllowPartial.getNumOccurrences() > 0) @@ -777,7 +780,7 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, return TripCount; if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount && - MaxTripCount <= UnrollMaxUpperBound) + MaxTripCount <= UP.MaxUpperBound) return MaxTripCount; // if didn't return until here, should continue to other priorties @@ -952,7 +955,7 @@ bool llvm::computeUnrollCount( // cost of exact full unrolling. As such, if we have an exact count and // found it unprofitable, we'll never chose to bounded unroll. if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) && - MaxTripCount <= UnrollMaxUpperBound) { + MaxTripCount <= UP.MaxUpperBound) { UP.Count = MaxTripCount; if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, MaxTripCount, UCE, UP)) { @@ -1026,7 +1029,7 @@ bool llvm::computeUnrollCount( } // Don't unroll a small upper bound loop unless user or TTI asked to do so. - if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) { + if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) { UP.Count = 0; return false; } >From ecc3e286cfe97ff033c00806658f6e69844c5434 Mon Sep 17 00:00:00 2001 From: "boxu.zhang" <boxu.zh...@hotmail.com> Date: Tue, 20 May 2025 12:22:45 +0800 Subject: [PATCH 2/3] [Clang][OpenCL][NVPTX] precommit test for using vprintf to implement builtin printf on OpenCL with NVPTX --- clang/test/CodeGenOpenCL/test-printf-nvptx.cl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 clang/test/CodeGenOpenCL/test-printf-nvptx.cl diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl new file mode 100644 index 0000000000000..6b8aa873a47b6 --- /dev/null +++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl @@ -0,0 +1,14 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -cl-std=CL3.0 -triple nvptx-- -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NV %s + +int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); + +// NV-LABEL: define dso_local spir_kernel void @test_printf( +// NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { +// NV-NEXT: entry: +// NV-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR2:[0-9]+]] +// NV-NEXT: ret void +// +__kernel void test_printf() { + printf("hello, printf on nvptx."); +} >From 64c686105c830eab5a83cbdc62a67b07b0314072 Mon Sep 17 00:00:00 2001 From: "boxu.zhang" <boxu.zh...@hotmail.com> Date: Tue, 20 May 2025 12:28:02 +0800 Subject: [PATCH 3/3] [Clang][OpenCL][NVPTX] using vprintf to implement builtin printf on OpenCL with NVPTX --- clang/lib/AST/ASTContext.cpp | 5 +++++ clang/lib/AST/Decl.cpp | 4 +++- clang/lib/CodeGen/CGGPUBuiltin.cpp | 11 +++++++++-- clang/lib/Sema/SemaDecl.cpp | 8 ++++++-- clang/test/CodeGenOpenCL/test-printf-nvptx.cl | 2 +- 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0395b3e47ab6f..8e3e6d3331b63 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -11481,6 +11481,11 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, // FIXME: There's no way to have a built-in with an rvalue ref arg. case 'C': Type = Type.withConst(); + // adjust 'const char *' to 'const char __constant *' on OpenCL + if (Context.getLangOpts().OpenCL && + Type.getTypePtr() == Context.CharTy.getTypePtr()) { + Type = Context.getAddrSpaceQualType(Type, LangAS::opencl_constant); + } break; case 'D': Type = Context.getVolatileType(Type); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index c2ea155679193..a0b334e4323c8 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3600,8 +3600,10 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const { // OpenCL v1.2 s6.9.f - The library functions defined in // the C99 standard headers are not available. + // EXCEPTION: printf is supported for AMDGPU if (Context.getLangOpts().OpenCL && - Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID)) + Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID) && + BuiltinID != Builtin::BIprintf) return 0; // CUDA does not have device-side standard library. printf and malloc are the diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp index e465789a003eb..aa7fa5426bff7 100644 --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -135,6 +135,7 @@ bool containsNonScalarVarargs(CodeGenFunction *CGF, const CallArgList &Args) { RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, llvm::Function *Decl, bool WithSizeArg) { CodeGenModule &CGM = CGF->CGM; + llvm::LLVMContext &Ctx = CGM.getLLVMContext(); CGBuilderTy &Builder = CGF->Builder; assert(E->getBuiltinCallee() == Builtin::BIprintf); assert(E->getNumArgs() >= 1); // printf always has at least one arg. @@ -155,9 +156,15 @@ RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args); llvm::Value *BufferPtr = r.first; + llvm::Value *Fmt = Args[0].getRValue(*CGF).getScalarVal(); - llvm::SmallVector<llvm::Value *, 3> Vec = { - Args[0].getRValue(*CGF).getScalarVal(), BufferPtr}; + // For OpenCL, the default addrspace of 'format' argument is LangAS::opencl_constant, + // however, the 'vprintf' requires it to be unqualified 'ptr' type. Do pointer cast if + // it's the case. + if (CGM.getContext().getLangOpts().OpenCL) + Fmt = Builder.CreatePointerCast(Fmt, llvm::PointerType::getUnqual(Ctx)); + + llvm::SmallVector<llvm::Value *, 3> Vec = {Fmt, BufferPtr}; if (WithSizeArg) { // Passing > 32bit of data as a local alloca doesn't work for nvptx or // amdgpu diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ffbe317d55999..6792c768977c4 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7008,8 +7008,12 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) { } void Sema::deduceOpenCLAddressSpace(ValueDecl *Decl) { - if (Decl->getType().hasAddressSpace()) - return; + // Address space is only meaningful for pointer type + if (Decl->getType()->isPointerType()) { + const PointerType *T = dyn_cast<PointerType>(Decl->getType().getTypePtr()); + if (T->getPointeeType().hasAddressSpace()) + return; + } if (Decl->getType()->isDependentType()) return; if (VarDecl *Var = dyn_cast<VarDecl>(Decl)) { diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl index 6b8aa873a47b6..4a674c396c23c 100644 --- a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl +++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl @@ -6,7 +6,7 @@ int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))) // NV-LABEL: define dso_local spir_kernel void @test_printf( // NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { // NV-NEXT: entry: -// NV-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR2:[0-9]+]] +// NV-NEXT: [[TMP0:%.*]] = call i32 @vprintf(ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null) // NV-NEXT: ret void // __kernel void test_printf() { _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits