llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: boxu.zhang (boxu-zhang) <details> <summary>Changes</summary> Last week, someone asking me to implement printf function on OpenCL with NVPTX which is strange for me, because I remember that the libdevice of CUDA has already implemented it and it should also work for OpenCL. However, it's not supported. Testing it with this simple case, the IR showed the kernel calls exact the printf function itself which is declared but not defined. ` int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); __kernel void test_printf() { printf("hello, printf on nvptx."); } ` And I did some research, found an obvious bug in "Sema::deduceOpenCLAddressSpace". In this function, the first 'if' statement checks whether the input 'Decl' type has address space, if it has, then just return. This logic is not correct for type of the 'format' string argument of 'printf' function. The '__constant const char * st' is the argument declaration which type is '__constant const char *' and has address space already(__constant, LangAS::opencl_constant). Call 'hasAddressSpace()' returns false with this type, which is obvious a bug. The reason is simple, it should check whether the 'Decl' type is a PointerType first, and then check whether the PointeeType has address space instead. The address space is only reasonable for PointeeType. See the commits for detail. Also I made some other changes to support implementing builtin printf with vprintf on OpenCL when the target is NVPTX. --- Full diff: https://github.com/llvm/llvm-project/pull/140671.diff 7 Files Affected: - (modified) clang/lib/AST/ASTContext.cpp (+5) - (modified) clang/lib/AST/Decl.cpp (+3-1) - (modified) clang/lib/CodeGen/CGGPUBuiltin.cpp (+9-2) - (modified) clang/lib/Sema/SemaDecl.cpp (+6-2) - (added) clang/test/CodeGenOpenCL/test-printf-nvptx.cl (+14) - (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+4) - (modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+6-3) ``````````diff diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0395b3e47ab6f..8e3e6d3331b63 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -11481,6 +11481,11 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, // FIXME: There's no way to have a built-in with an rvalue ref arg. case 'C': Type = Type.withConst(); + // adjust 'const char *' to 'const char __constant *' on OpenCL + if (Context.getLangOpts().OpenCL && + Type.getTypePtr() == Context.CharTy.getTypePtr()) { + Type = Context.getAddrSpaceQualType(Type, LangAS::opencl_constant); + } break; case 'D': Type = Context.getVolatileType(Type); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index c2ea155679193..a0b334e4323c8 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3600,8 +3600,10 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const { // OpenCL v1.2 s6.9.f - The library functions defined in // the C99 standard headers are not available. + // EXCEPTION: printf is supported for AMDGPU if (Context.getLangOpts().OpenCL && - Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID)) + Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID) && + BuiltinID != Builtin::BIprintf) return 0; // CUDA does not have device-side standard library. printf and malloc are the diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp index e465789a003eb..aa7fa5426bff7 100644 --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -135,6 +135,7 @@ bool containsNonScalarVarargs(CodeGenFunction *CGF, const CallArgList &Args) { RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, llvm::Function *Decl, bool WithSizeArg) { CodeGenModule &CGM = CGF->CGM; + llvm::LLVMContext &Ctx = CGM.getLLVMContext(); CGBuilderTy &Builder = CGF->Builder; assert(E->getBuiltinCallee() == Builtin::BIprintf); assert(E->getNumArgs() >= 1); // printf always has at least one arg. @@ -155,9 +156,15 @@ RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args); llvm::Value *BufferPtr = r.first; + llvm::Value *Fmt = Args[0].getRValue(*CGF).getScalarVal(); - llvm::SmallVector<llvm::Value *, 3> Vec = { - Args[0].getRValue(*CGF).getScalarVal(), BufferPtr}; + // For OpenCL, the default addrspace of 'format' argument is LangAS::opencl_constant, + // however, the 'vprintf' requires it to be unqualified 'ptr' type. Do pointer cast if + // it's the case. + if (CGM.getContext().getLangOpts().OpenCL) + Fmt = Builder.CreatePointerCast(Fmt, llvm::PointerType::getUnqual(Ctx)); + + llvm::SmallVector<llvm::Value *, 3> Vec = {Fmt, BufferPtr}; if (WithSizeArg) { // Passing > 32bit of data as a local alloca doesn't work for nvptx or // amdgpu diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ffbe317d55999..6792c768977c4 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7008,8 +7008,12 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) { } void Sema::deduceOpenCLAddressSpace(ValueDecl *Decl) { - if (Decl->getType().hasAddressSpace()) - return; + // Address space is only meaningful for pointer type + if (Decl->getType()->isPointerType()) { + const PointerType *T = dyn_cast<PointerType>(Decl->getType().getTypePtr()); + if (T->getPointeeType().hasAddressSpace()) + return; + } if (Decl->getType()->isDependentType()) return; if (VarDecl *Var = dyn_cast<VarDecl>(Decl)) { diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl new file mode 100644 index 0000000000000..4a674c396c23c --- /dev/null +++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl @@ -0,0 +1,14 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -cl-std=CL3.0 -triple nvptx-- -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NV %s + +int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); + +// NV-LABEL: define dso_local spir_kernel void @test_printf( +// NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { +// NV-NEXT: entry: +// NV-NEXT: [[TMP0:%.*]] = call i32 @vprintf(ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null) +// NV-NEXT: ret void +// +__kernel void test_printf() { + printf("hello, printf on nvptx."); +} diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f5114fa40c70a..735be3680aea0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -560,6 +560,10 @@ class TargetTransformInfo { // (set to UINT_MAX to disable). This does not apply in cases where the // loop is being fully unrolled. unsigned MaxCount; + /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound + /// to be overrided by a target gives more flexiblity on certain cases. + /// By default, MaxUpperBound uses UnrollMaxUpperBound which value is 8. + unsigned MaxUpperBound; /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but /// applies even if full unrolling is selected. This allows a target to fall /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount. diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index f14541a1a037e..7cfeb019af972 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -200,6 +200,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.Count = 0; UP.DefaultUnrollRuntimeCount = 8; UP.MaxCount = std::numeric_limits<unsigned>::max(); + UP.MaxUpperBound = UnrollMaxUpperBound; UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max(); UP.BEInsns = 2; UP.Partial = false; @@ -237,6 +238,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost; if (UnrollMaxCount.getNumOccurrences() > 0) UP.MaxCount = UnrollMaxCount; + if (UnrollMaxUpperBound.getNumOccurrences() > 0) + UP.MaxUpperBound = UnrollMaxUpperBound; if (UnrollFullMaxCount.getNumOccurrences() > 0) UP.FullUnrollMaxCount = UnrollFullMaxCount; if (UnrollAllowPartial.getNumOccurrences() > 0) @@ -777,7 +780,7 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, return TripCount; if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount && - MaxTripCount <= UnrollMaxUpperBound) + MaxTripCount <= UP.MaxUpperBound) return MaxTripCount; // if didn't return until here, should continue to other priorties @@ -952,7 +955,7 @@ bool llvm::computeUnrollCount( // cost of exact full unrolling. As such, if we have an exact count and // found it unprofitable, we'll never chose to bounded unroll. if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) && - MaxTripCount <= UnrollMaxUpperBound) { + MaxTripCount <= UP.MaxUpperBound) { UP.Count = MaxTripCount; if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, MaxTripCount, UCE, UP)) { @@ -1026,7 +1029,7 @@ bool llvm::computeUnrollCount( } // Don't unroll a small upper bound loop unless user or TTI asked to do so. - if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) { + if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) { UP.Count = 0; return false; } `````````` </details> https://github.com/llvm/llvm-project/pull/140671 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits