https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/202095
>From a0750f6be992fec6b58e2859b1dce9f6f6d5ad87 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Thu, 4 Jun 2026 10:28:01 -0400 Subject: [PATCH 1/3] Reland HIP offload PGO compiler support (LLVM/Clang part of #177665) Relands the compiler portion of #177665 ("[PGO][AMDGPU] Add basic HIP offload PGO support"), which was reverted in #201416: the AMDGPU instrumentation lowering in LLVM (InstrProfiling.cpp / PGOInstrumentation.cpp), the HIP device-variable / shadow registration in Clang codegen (CGCUDANV.cpp), the supporting InstrProf.h / RuntimeLibcalls.td changes, and their tests. This commit restores the reverted state verbatim. It depends on the runtime ABI provided by the compiler-rt part (relanded separately); the next commit adds the driver wiring to link the ROCm device-profile library for HIP -fprofile-generate. --- clang/lib/CodeGen/CGCUDANV.cpp | 152 ++++++++++++++++++ .../test/CodeGenHIP/offload-pgo-sections.hip | 50 ++++++ llvm/include/llvm/IR/RuntimeLibcalls.td | 9 +- llvm/include/llvm/ProfileData/InstrProf.h | 5 + .../Instrumentation/InstrProfiling.cpp | 109 ++++++++++--- .../Instrumentation/PGOInstrumentation.cpp | 3 +- .../InstrProfiling/amdgpu-instrumentation.ll | 32 ++++ .../InstrProfiling/amdgpu-profc-arrays.ll | 26 +++ .../InstrProfiling/gpu-weak.ll | 36 +++++ .../amdgpu-disable-value-profiling.ll | 22 +++ 10 files changed, 415 insertions(+), 29 deletions(-) create mode 100644 clang/test/CodeGenHIP/offload-pgo-sections.hip create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll create mode 100644 llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll create mode 100644 llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 259b6c040706b..65f398af7902b 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/Format.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" using namespace clang; using namespace CodeGen; @@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() llvm::GlobalVariable *GpuBinaryHandle = nullptr; + /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global, + /// emitted only for HIP host compiles when PGO is on. Registered via + /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime + /// can locate the device-side table by name. + llvm::GlobalVariable *OffloadProfShadow = nullptr; /// Whether we generate relocatable device code. bool RelocatableDeviceCode; /// Mangle context for device. @@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime { void transformManagedVars(); /// Create offloading entries to register globals in RDC mode. void createOffloadingEntries(); + /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global. + /// On the device side it is the populated 7-pointer section-bounds table. + /// On the host side it is a placeholder void* shadow stored in + /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC) + /// or createOffloadingEntries (RDC) so the runtime can locate the + /// device-side table by name. + void emitOffloadProfilingSections(); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { } } + // Register the per-TU offload-profiling shadow so the host runtime can + // locate the matching device-side __llvm_profile_sections_<CUID>. We + // emit both __hipRegisterVar (so the HIP runtime can map the host + // shadow to the device symbol) and + // __llvm_profile_offload_register_shadow_variable (so the profile + // runtime adds the shadow to its drain list). + if (OffloadProfShadow) { + llvm::Constant *Name = + makeConstantString(std::string(OffloadProfShadow->getName())); + llvm::Value *RegisterVarArgs[] = { + &GpuBinaryHandlePtr, + OffloadProfShadow, + Name, + Name, + llvm::ConstantInt::get(IntTy, /*Extern=*/0), + llvm::ConstantInt::get(VarSizeTy, CGM.getDataLayout().getPointerSize()), + llvm::ConstantInt::get(IntTy, /*Constant=*/0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, RegisterVarArgs); + + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + Builder.CreateCall(RegisterShadow, {OffloadProfShadow}); + } + Builder.CreateRetVoid(); return RegisterKernelsFunc; } @@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() { I.Flags.getSurfTexType()); } } + + // Register the per-TU offload-profiling shadow. The offloading entry + // makes the linker-wrapper emit the host __hipRegisterVar call in the + // combined ctor. Separately emit a per-TU ctor that registers the + // shadow with the profile runtime's drain list. + if (OffloadProfShadow) { + llvm::offloading::emitOffloadingEntry( + M, Kind, OffloadProfShadow, OffloadProfShadow->getName(), + CGM.getDataLayout().getPointerSize(), + llvm::offloading::OffloadGlobalEntry, /*Data=*/0); + + llvm::LLVMContext &Ctx = M.getContext(); + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + auto *CtorFn = llvm::Function::Create( + llvm::FunctionType::get(VoidTy, false), + llvm::GlobalValue::InternalLinkage, + "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), &M); + auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn); + llvm::IRBuilder<> B(Entry); + B.CreateCall(RegisterShadow, {OffloadProfShadow}); + B.CreateRetVoid(); + llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535); + } +} + +// For HIP host+device compiles with PGO enabled, emit the per-TU global +// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding +// section start/stop bounds for the names/counters/data sections plus the +// raw-version variable. Host side: an opaque void* shadow whose only +// purpose is to give the host-runtime a registered symbol name to look up +// via hipGetSymbolAddress; the actual device-side data lives in the +// matching device-side global. +void CGNVCUDARuntime::emitOffloadProfilingSections() { + if (!CGM.getLangOpts().HIP) + return; + if (!CGM.getCodeGenOpts().hasProfileInstr()) + return; + + StringRef CUIDHash = CGM.getContext().getCUIDHash(); + if (CUIDHash.empty()) + return; + + llvm::Module &M = CGM.getModule(); + llvm::LLVMContext &Ctx = M.getContext(); + std::string Name = ("__llvm_profile_sections_" + CUIDHash).str(); + + // If the global already exists (e.g. another TU was merged in), don't + // duplicate it. + if (M.getNamedValue(Name)) + return; + + if (CGM.getLangOpts().CUDAIsDevice) { + // Device side: emit the populated struct. Section start/stop symbols + // are linker-defined (ELF auto-generates __start_/__stop_ for any + // section whose name is a valid C identifier; AMDGPU is ELF). + unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); + auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS); + auto getOrDeclare = [&](StringRef SymName) { + if (auto *GV = M.getNamedGlobal(SymName)) + return GV; + auto *GV = new llvm::GlobalVariable( + M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName, + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::HiddenVisibility); + return GV; + }; + auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version"); + if (!VersionGV) { + VersionGV = new llvm::GlobalVariable( + M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, + "__llvm_profile_raw_version", + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + } + + auto *StructTy = llvm::StructType::get( + Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); + llvm::Constant *Fields[] = { + getOrDeclare("__start___llvm_prf_names"), + getOrDeclare("__stop___llvm_prf_names"), + getOrDeclare("__start___llvm_prf_cnts"), + getOrDeclare("__stop___llvm_prf_cnts"), + getOrDeclare("__start___llvm_prf_data"), + getOrDeclare("__stop___llvm_prf_data"), + VersionGV, + }; + auto *Init = llvm::ConstantStruct::get(StructTy, Fields); + auto *GV = new llvm::GlobalVariable( + M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, + Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); + CGM.addCompilerUsedGlobal(GV); + return; + } + + // Host side: emit an opaque void* shadow. Layout doesn't matter — the + // runtime locates it by name via hipGetSymbolAddress and treats it as + // the address of the device-side struct. Registration with the HIP + // runtime is added by makeRegisterGlobalsFn (non-RDC) or + // createOffloadingEntries (RDC). + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + OffloadProfShadow = new llvm::GlobalVariable( + M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), Name); + CGM.addCompilerUsedGlobal(OffloadProfShadow); } // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { transformManagedVars(); + emitOffloadProfilingSections(); if (CGM.getLangOpts().CUDAIsDevice) { // Mark ODR-used device variables as compiler used to prevent it from being // eliminated by optimization. This is necessary for device variables diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip b/clang/test/CodeGenHIP/offload-pgo-sections.hip new file mode 100644 index 0000000000000..17c6fe7b9e609 --- /dev/null +++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip @@ -0,0 +1,50 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: x86-registered-target + +// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global +// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct +// in addrspace(1). Host compile: void* shadow registered with the HIP +// runtime and with the profile runtime's drain list. + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=DEV %s + +// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=HOST %s + +// Guard: no PGO -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +// Guard: no CUID -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ int helper(int x) { return x + 1; } +__global__ void kernel(int *p) { *p = helper(*p); } + +// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64 +// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version +// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] + +// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] +// HOST: define internal void @__hip_register_globals +// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]], +// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) + +// NONE-NOT: __llvm_profile_sections_ +// NONE-NOT: __llvm_profile_offload_register_shadow_variable diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index cbdc48a9a717f..37bad559f49e7 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -500,6 +500,9 @@ def DEOPTIMIZE : RuntimeLibcall; // Return address def RETURN_ADDRESS : RuntimeLibcall; +// GPU profiling +def PROFILE_INSTRUMENT_GPU : RuntimeLibcall; + // Clear cache def CLEAR_CACHE : RuntimeLibcall; def RISCV_FLUSH_ICACHE : RuntimeLibcall; @@ -2241,8 +2244,10 @@ def WindowsARM64ECSystemLibrary def isAMDGPU : RuntimeLibcallPredicate<"TT.isAMDGPU()">; -// No calls. -def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add)>; +def __llvm_profile_instrument_gpu : RuntimeLibcallImpl<PROFILE_INSTRUMENT_GPU>; + +def AMDGPUSystemLibrary + : SystemRuntimeLibrary<isAMDGPU, (add __llvm_profile_instrument_gpu)>; //===----------------------------------------------------------------------===// // ARM Runtime Libcalls diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 33e55960509a7..90471e910bb50 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -122,6 +122,11 @@ inline StringRef getInstrProfValueProfMemOpFuncName() { /// Return the prefix of the name of the variables to function as a filter. inline StringRef getInstrProfVarPrefix() { return "__prof"; } +/// Return the name of the GPU wave-cooperative counter increment helper. +inline StringRef getInstrProfInstrumentGPUFuncName() { + return INSTR_PROF_INSTRUMENT_GPU_FUNC_STR; +} + /// Return the name prefix of variables containing instrumented function names. inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; } diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index dabd495cddd49..8e4ba41919768 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -33,14 +34,17 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" @@ -287,6 +291,8 @@ class InstrLowerer final { GlobalVariable *NamesVar = nullptr; size_t NamesSize = 0; + StructType *ProfileDataTy = nullptr; + // vector of counter load/store pairs to be register promoted. std::vector<LoadStorePair> PromotionCandidates; @@ -407,6 +413,9 @@ class InstrLowerer final { /// Create a static initializer for our data, on platforms that need it, /// and for any profile output file that was specified. void emitInitialization(); + + /// Return the __llvm_profile_data struct type. + StructType *getProfileDataTy(); }; /// @@ -1190,19 +1199,22 @@ void InstrLowerer::lowerTimestamp( void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) { auto *Addr = getCounterAddress(Inc); - IRBuilder<> Builder(Inc); if (isGPUProfTarget(M)) { - auto *I64Ty = Builder.getInt64Ty(); + auto *Int64Ty = Builder.getInt64Ty(); auto *PtrTy = Builder.getPtrTy(); auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()), - {PtrTy, PtrTy, I64Ty}, false); - auto Callee = - M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy); + {PtrTy, PtrTy, Int64Ty}, false); + FunctionCallee Callee = + M.getOrInsertFunction(RTLIB::RuntimeLibcallsInfo::getLibcallImplName( + RTLIB::impl___llvm_profile_instrument_gpu), + CalleeTy); Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy); Value *Uniform = ConstantPointerNull::get(PointerType::getUnqual(M.getContext())); - Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()}); + Value *StepI64 = + Builder.CreateZExtOrTrunc(Inc->getStep(), Int64Ty, "step.i64"); + Builder.CreateCall(Callee, {CastAddr, Uniform, StepI64}); } else if (Options.Atomic || AtomicCounterUpdateAll || (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) { Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(), @@ -1400,6 +1412,12 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) { if (shouldUsePublicSymbol(Fn)) return Fn; + // For GPU targets, weak functions cannot use private aliases because + // LTO may pick a different TU's copy, leaving the alias undefined + if (isGPUProfTarget(*Fn->getParent()) && + GlobalValue::isWeakForLinker(Fn->getLinkage())) + return Fn; + // When possible use a private alias to avoid symbolic relocations. auto *GA = GlobalAlias::create(GlobalValue::LinkageTypes::PrivateLinkage, Fn->getName() + ".local", Fn); @@ -1623,11 +1641,15 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc, } Ptr->setVisibility(Visibility); - // Put the counters and bitmaps in their own sections so linkers can - // remove unneeded sections. Ptr->setSection(getInstrProfSectionName(IPSK, TT.getObjectFormat())); Ptr->setLinkage(Linkage); - maybeSetComdat(Ptr, Fn, VarName); + if (isGPUProfTarget(M) && !Ptr->hasComdat()) { + Ptr->setComdat(M.getOrInsertComdat(VarName)); + Ptr->setLinkage(GlobalValue::LinkOnceODRLinkage); + Ptr->setVisibility(GlobalValue::ProtectedVisibility); + } else { + maybeSetComdat(Ptr, Fn, VarName); + } return Ptr; } @@ -1799,7 +1821,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { } uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); - auto *CounterPtr = PD.RegionCounters; + + Constant *CounterPtr = PD.RegionCounters; uint64_t NumBitmapBytes = PD.NumBitmapBytes; @@ -1807,11 +1830,7 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); auto *Int16Ty = Type::getInt16Ty(Ctx); auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1); - Type *DataTypes[] = { -#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, -#include "llvm/ProfileData/InstrProfData.inc" - }; - auto *DataTy = StructType::get(Ctx, ArrayRef(DataTypes)); + auto *DataTy = getProfileDataTy(); Constant *FunctionAddr = getFuncAddrForProfData(Fn); @@ -1819,6 +1838,15 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + if (isGPUProfTarget(M)) { + // For GPU targets, weak functions need weak linkage for their profile data + // aliases to allow linker deduplication across TUs + if (GlobalValue::isWeakForLinker(Fn->getLinkage())) + Linkage = Fn->getLinkage(); + else + Linkage = GlobalValue::ExternalLinkage; + Visibility = GlobalValue::ProtectedVisibility; + } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. This @@ -1830,19 +1858,22 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. - if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + if (!isGPUProfTarget(M) && NS == 0 && + !(DataReferencedByCode && NeedComdat && !Renamed) && (TT.isOSBinFormatELF() || (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } - // AMDGPU objects are always ET_DYN, so non-local symbols with default - // visibility are preemptible. The CounterPtr label difference emits a REL32 - // relocation that lld rejects against preemptible targets. - if (TT.isAMDGPU() && !GlobalValue::isLocalLinkage(Linkage)) + // GPU-target ELF objects are always ET_DYN, so non-local symbols with + // default visibility are preemptible. The CounterPtr label difference + // emits a REL32 relocation that lld rejects against preemptible targets. + if (TT.isGPU() && TT.isOSBinFormatELF() && + !GlobalValue::isLocalLinkage(Linkage)) Visibility = GlobalValue::ProtectedVisibility; auto *Data = new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName); + Constant *RelativeCounterPtr; GlobalVariable *BitmapPtr = PD.RegionBitmaps; Constant *RelativeBitmapPtr = ConstantInt::get(IntPtrTy, 0); @@ -1883,7 +1914,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { Data->setSection( getInstrProfSectionName(DataSectionKind, TT.getObjectFormat())); Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); - maybeSetComdat(Data, Fn, CntsVarName); + if (isGPUProfTarget(M) && !Data->hasComdat()) { + Data->setComdat(M.getOrInsertComdat(CntsVarName)); + Data->setLinkage(GlobalValue::LinkOnceODRLinkage); + } else { + maybeSetComdat(Data, Fn, CntsVarName); + } PD.DataVar = Data; @@ -1961,16 +1997,18 @@ void InstrLowerer::emitNameData() { auto &Ctx = M.getContext(); auto *NamesVal = ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); - NamesVar = new GlobalVariable(M, NamesVal->getType(), true, - GlobalValue::PrivateLinkage, NamesVal, - getInstrProfNamesVarName()); + std::string NamesVarName = std::string(getInstrProfNamesVarName()); + NamesVar = + new GlobalVariable(M, NamesVal->getType(), true, + GlobalValue::PrivateLinkage, NamesVal, NamesVarName); NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); - NamesVar->setSection( + std::string NamesSectionName = ProfileCorrelate == InstrProfCorrelator::BINARY ? getInstrProfSectionName(IPSK_covname, TT.getObjectFormat()) - : getInstrProfSectionName(IPSK_name, TT.getObjectFormat())); + : getInstrProfSectionName(IPSK_name, TT.getObjectFormat()); + NamesVar->setSection(NamesSectionName); // On COFF, it's important to reduce the alignment down to 1 to prevent the // linker from inserting padding before the start of the names section or // between names entries. @@ -2179,3 +2217,22 @@ void createProfileSamplingVar(Module &M) { appendToCompilerUsed(M, SamplingVar); } } // namespace llvm + +// For GPU targets: Allocate contiguous arrays for all profile data. +// This solves the linker reordering problem by using ONE symbol per section +// type, so there's nothing for the linker to reorder. +StructType *InstrLowerer::getProfileDataTy() { + if (ProfileDataTy) + return ProfileDataTy; + + auto &Ctx = M.getContext(); + auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); + auto *Int16Ty = Type::getInt16Ty(Ctx); + auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1); + Type *DataTypes[] = { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, +#include "llvm/ProfileData/InstrProfData.inc" + }; + ProfileDataTy = StructType::get(Ctx, ArrayRef(DataTypes)); + return ProfileDataTy; +} diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index db032d6fcad45..b6d07aa821e7f 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -380,7 +380,8 @@ class FunctionInstrumenter final { // another counter range within the context. bool isValueProfilingDisabled() const { return DisableValueProfiling || - InstrumentationType == PGOInstrumentationType::CTXPROF; + InstrumentationType == PGOInstrumentationType::CTXPROF || + M.getTargetTriple().isGPU(); } bool shouldInstrumentEntryBB() const { diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll new file mode 100644 index 0000000000000..efe53ab1ebdfb --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll @@ -0,0 +1,32 @@ +;; Test basic AMDGPU PGO instrumentation lowering. +;; Verifies that each instrumentation point lowers directly to a call to +;; __llvm_profile_instrument_gpu with a null uniform-counter argument. + +; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s + +@__hip_cuid_test01 = addrspace(1) global i8 0 +@__profn_test_kernel = private constant [11 x i8] c"test_kernel" + +define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out, i32 %n) { +entry: + call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 0) + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 1) + store i32 1, ptr addrspace(1) %out + br label %if.end + +if.end: + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) + +; CHECK-LABEL: define {{.*}} @test_kernel +; CHECK-NOT: @__llvm_profile_sampling_gpu +; CHECK: call void @__llvm_profile_instrument_gpu( +; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) @__profc_test_kernel to ptr), ptr null, i64 1) +; CHECK: call void @__llvm_profile_instrument_gpu( +; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([4 x i64], ptr addrspace(1) @__profc_test_kernel, i32 0, i32 1) to ptr), ptr null, i64 1) diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll new file mode 100644 index 0000000000000..eab78fb3591b1 --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll @@ -0,0 +1,26 @@ +;; Per-kernel __profc_* arrays land in section __llvm_prf_cnts with one slot +;; per counter, and counter increments lower to __llvm_profile_instrument_gpu +;; calls whose pointer argument is a GEP into the per-kernel array. + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instrprof < %s | FileCheck %s + +@__profn_kernel1 = private constant [7 x i8] c"kernel1" +@__profn_kernel2 = private constant [7 x i8] c"kernel2" + +; CHECK: @__profc_kernel1 = linkonce_odr protected addrspace(1) global [2 x i64] zeroinitializer, section "__llvm_prf_cnts" +; CHECK: @__profc_kernel2 = linkonce_odr protected addrspace(1) global [1 x i64] zeroinitializer, section "__llvm_prf_cnts" + +define amdgpu_kernel void @kernel1() { + call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 0) + call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 1) + ret void +} + +define amdgpu_kernel void @kernel2() { + call void @llvm.instrprof.increment(ptr @__profn_kernel2, i64 67890, i32 1, i32 0) + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) + +; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([2 x i64], ptr addrspace(1) @__profc_kernel1, i32 0, i32 1) to ptr), ptr null, i64 1) diff --git a/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll new file mode 100644 index 0000000000000..ce16f1ee3215f --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -passes=instrprof -S | FileCheck %s + +; Test that weak functions on GPU targets get weak linkage for their +; __profd_ aliases to allow linker deduplication across TUs. +; Non-weak functions get external linkage (default for aliases). + +target triple = "amdgcn-amd-amdhsa" + +@__hip_cuid_abc123 = addrspace(1) global i8 0 + +; AMDGPU GPU profiling lowers to per-function comdat globals (not aliases). +; CHECK: @__profd_weak_func = linkonce_odr protected addrspace(1) global +@__profn_weak_func = private constant [9 x i8] c"weak_func" + +define weak void @weak_func() { + call void @llvm.instrprof.increment(ptr @__profn_weak_func, i64 0, i32 1, i32 0) + ret void +} + +; CHECK: @__profd_weak_odr_func = linkonce_odr protected addrspace(1) global +@__profn_weak_odr_func = private constant [13 x i8] c"weak_odr_func" + +define weak_odr void @weak_odr_func() { + call void @llvm.instrprof.increment(ptr @__profn_weak_odr_func, i64 0, i32 1, i32 0) + ret void +} + +; CHECK: @__profd_normal_func = linkonce_odr protected addrspace(1) global +@__profn_normal_func = private constant [11 x i8] c"normal_func" + +define void @normal_func() { + call void @llvm.instrprof.increment(ptr @__profn_normal_func, i64 0, i32 1, i32 0) + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) diff --git a/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll new file mode 100644 index 0000000000000..21b1d68004b13 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll @@ -0,0 +1,22 @@ +;; Test that value profiling (indirect call profiling) is disabled for GPU targets. +;; The device-side profiling runtime does not implement +;; __llvm_profile_instrument_target, so indirect call profiling must not be emitted. + +; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +@fptr = addrspace(1) global ptr null, align 8 + +;; Verify that regular block instrumentation IS emitted +; CHECK: call void @llvm.instrprof.increment + +;; Verify that value profiling for indirect calls is NOT emitted +; CHECK-NOT: call void @llvm.instrprof.value.profile + +define amdgpu_kernel void @test_indirect_call() { +entry: + %fp = load ptr, ptr addrspace(1) @fptr, align 8 + call void %fp() + ret void +} >From 554db6e96ecdd0d352c691e3c001f67b226dfcbd Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Thu, 4 Jun 2026 10:34:22 -0400 Subject: [PATCH 2/3] [PGO][HIP] Link clang_rt.profile_rocm on the host for HIP device PGO When building HIP with device PGO (-fprofile-generate / -fprofile-instr-generate) the host program must drain device-side profile counters. That collection runtime lives in the separate clang_rt.profile_rocm library (see the compiler-rt change), a self-contained superset of clang_rt.profile that also carries the ROCm collector and its interceptor dependency. Link clang_rt.profile_rocm into the host image for HIP device PGO. This is done in addOffloadRTLibs (which already knows the active offload kinds and links libamdhip64) for both the Linux/GNU and MSVC host toolchains, gated on the HIP offload kind, needsProfileRT, and the library being present in the resource directory. addOffloadRTLibs runs before addProfileRTLibs, so clang_rt.profile_rocm is emitted ahead of the base clang_rt.profile; being a superset it resolves all profile symbols and the base archive stays inert, which also avoids mixing the /MD ROCm runtime with the /MT base runtime in the host image on Windows. Non-HIP profile links and HIP links without profiling are unaffected. This change depends on the compiler-rt change that adds clang_rt.profile_rocm. --- clang/lib/Driver/ToolChains/Linux.cpp | 7 +++++ clang/lib/Driver/ToolChains/MSVC.cpp | 7 +++++ .../test/Driver/hip-profile-rocm-runtime.hip | 31 +++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 clang/test/Driver/hip-profile-rocm-runtime.hip diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 5f04afe34c554..d3c94c8addffa 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -902,6 +902,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, if (ActiveKinds & Action::OFK_HIP) CmdArgs.push_back( Args.MakeArgString(StringRef("-L") + RocmInstallation->getLibPath())); + + // For HIP device PGO, link clang_rt.profile_rocm when available. It is a + // self-contained superset of clang_rt.profile, emitted first so the base + // archive stays inert. + if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) && + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); } void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs, diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 6bc58699fb007..8141f9f132421 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -592,6 +592,13 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, CmdArgs.append({Args.MakeArgString(StringRef("-libpath:") + RocmInstallation->getLibPath()), "amdhip64.lib"}); + + // For HIP device PGO, link clang_rt.profile_rocm when available. It is a + // self-contained superset of clang_rt.profile, emitted first so the base + // archive stays inert (avoiding a /MD-vs-/MT CRT mix in the host image). + if (needsProfileRT(Args) && + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); } } diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip b/clang/test/Driver/hip-profile-rocm-runtime.hip new file mode 100644 index 0000000000000..5e99d3f4f2fee --- /dev/null +++ b/clang/test/Driver/hip-profile-rocm-runtime.hip @@ -0,0 +1,31 @@ +// REQUIRES: x86-registered-target, amdgpu-registered-target +// UNSUPPORTED: system-windows + +// Build a fake resource dir containing both the base profile runtime and the +// ROCm device-profile runtime so the driver's existence check passes. +// RUN: rm -rf %t && mkdir -p %t/lib/x86_64-unknown-linux +// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile.a +// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile_rocm.a +// RUN: touch %t.o + +// HIP host link with PGO links clang_rt.profile_rocm. +// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \ +// RUN: -fprofile-instr-generate -resource-dir=%t \ +// RUN: --rocm-path=%S/Inputs/rocm %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=HIP-PGO %s +// profile_rocm must precede the base profile so the base archive stays inert. +// HIP-PGO: "{{.*}}libclang_rt.profile_rocm.a" +// HIP-PGO: "{{.*}}libclang_rt.profile.a" + +// Without profiling, the ROCm device-profile runtime is not linked. +// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \ +// RUN: -resource-dir=%t --rocm-path=%S/Inputs/rocm %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=HIP-NOPGO %s +// HIP-NOPGO-NOT: libclang_rt.profile_rocm.a + +// A non-HIP host link with PGO does not link the ROCm device-profile runtime. +// RUN: %clang -### --target=x86_64-unknown-linux \ +// RUN: -fprofile-instr-generate -resource-dir=%t %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=HOST-PGO %s +// HOST-PGO: "{{.*}}libclang_rt.profile.a" +// HOST-PGO-NOT: libclang_rt.profile_rocm.a >From c3d2da52d83793792de016cebe0525d6cd56ebfa Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Sun, 7 Jun 2026 16:48:15 -0400 Subject: [PATCH 3/3] [PGO][HIP] Fix HIP device profile collection and sections emission Several related HIP device-PGO fixes: Windows device collection. HIP rejects a hipMemcpy that reads past the bounds of a symbol registered with __hipRegisterVar, but device data/counters/names live in merged linker sections. Register a separate shadow for each device data, counters, and names symbol and copy each one by its exact hipGetSymbolSize size; this also lets static TUs with several kernels keep all their profile data. Open the device profile file in binary mode and pass the device names to the correct lprofWriteDataImpl arguments so llvm-profdata can read the raw profile. Open the versioned amdhip64_7.dll first, falling back to amdhip64.dll. Per-TU sections struct. Clang CodeGen emitted the __llvm_profile_sections_<CUID> struct (and its section start/stop references) for any profiling-enabled device TU. A TU with no instrumented device functions then referenced sections nothing populates, so the RDC device link failed under --no-undefined (and duplicated __llvm_prf_nm before per-CUID naming). Move the struct emission from CGCUDANV into the InstrProfiling pass, which emits it only when the TU has profile data; clang emits only the per-TU names-postfix marker, also making names unique per TU so RDC builds do not clash. Dynamic-module interceptors. The hipModuleLoad* interceptors live in a constructor-only object in clang_rt.profile_rocm that nothing references, so the linker drops it and dynamic-module programs collect no device profile. When linking clang_rt.profile_rocm, emit a force-link reference (-u on ELF, -include: on COFF); the constructor self-skips when the program does not use hipModuleLoad. Depends on #201607 (reland HIP offload PGO compiler support and link the device-profile runtime); that PR must land first. --- clang/lib/CodeGen/CGCUDANV.cpp | 113 +++-- clang/lib/Driver/ToolChains/Linux.cpp | 7 +- clang/lib/Driver/ToolChains/MSVC.cpp | 8 +- .../test/CodeGenHIP/offload-pgo-sections.hip | 54 ++- .../test/Driver/hip-profile-rocm-runtime.hip | 2 + compiler-rt/lib/profile/InstrProfilingFile.c | 11 +- .../profile/InstrProfilingPlatformROCm.cpp | 433 ++++++++++++++---- llvm/include/llvm/ProfileData/InstrProf.h | 4 + .../Instrumentation/InstrProfiling.cpp | 77 +++- 9 files changed, 560 insertions(+), 149 deletions(-) diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 65f398af7902b..f19813db189ef 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/ReplaceConstant.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Format.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -37,6 +38,10 @@ namespace { constexpr unsigned CudaFatMagic = 0x466243b1; constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" +static std::string getOffloadProfilingNamesVarName(llvm::StringRef CUIDHash) { + return (llvm::Twine(llvm::getInstrProfNamesVarName()) + "_" + CUIDHash).str(); +} + class CGNVCUDARuntime : public CGCUDARuntime { /// The prefix used for function calls and section names (CUDA, HIP, LLVM) @@ -78,6 +83,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime /// can locate the device-side table by name. llvm::GlobalVariable *OffloadProfShadow = nullptr; + struct OffloadProfSectionShadowInfo { + llvm::GlobalVariable *Shadow; + std::string DeviceName; + }; + llvm::SmallVector<OffloadProfSectionShadowInfo, 16> OffloadProfSectionShadows; /// Whether we generate relocatable device code. bool RelocatableDeviceCode; /// Mangle context for device. @@ -774,6 +784,27 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { Builder.CreateCall(RegisterShadow, {OffloadProfShadow}); } + if (!OffloadProfSectionShadows.empty()) { + llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_section_shadow_variable"); + for (const auto &Info : OffloadProfSectionShadows) { + llvm::Constant *Name = makeConstantString(Info.DeviceName); + llvm::Value *RegisterVarArgs[] = { + &GpuBinaryHandlePtr, + Info.Shadow, + Name, + Name, + llvm::ConstantInt::get(IntTy, /*Extern=*/0), + llvm::ConstantInt::get(VarSizeTy, + CGM.getDataLayout().getPointerSize()), + llvm::ConstantInt::get(IntTy, /*Constant=*/0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, RegisterVarArgs); + Builder.CreateCall(RegisterSectionShadow, {Info.Shadow}); + } + } + Builder.CreateRetVoid(); return RegisterKernelsFunc; } @@ -1311,6 +1342,9 @@ void CGNVCUDARuntime::createOffloadingEntries() { llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidTy, {PtrTy}, false), "__llvm_profile_offload_register_shadow_variable"); + llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_section_shadow_variable"); auto *CtorFn = llvm::Function::Create( llvm::FunctionType::get(VoidTy, false), llvm::GlobalValue::InternalLinkage, @@ -1318,6 +1352,13 @@ void CGNVCUDARuntime::createOffloadingEntries() { auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn); llvm::IRBuilder<> B(Entry); B.CreateCall(RegisterShadow, {OffloadProfShadow}); + for (const auto &Info : OffloadProfSectionShadows) { + llvm::offloading::emitOffloadingEntry( + M, Kind, Info.Shadow, Info.DeviceName, + CGM.getDataLayout().getPointerSize(), + llvm::offloading::OffloadGlobalEntry, /*Data=*/0); + B.CreateCall(RegisterSectionShadow, {Info.Shadow}); + } B.CreateRetVoid(); llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535); } @@ -1350,50 +1391,23 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() { return; if (CGM.getLangOpts().CUDAIsDevice) { - // Device side: emit the populated struct. Section start/stop symbols - // are linker-defined (ELF auto-generates __start_/__stop_ for any - // section whose name is a valid C identifier; AMDGPU is ELF). + // Device side: emit only the per-TU names postfix marker. The sections + // struct is emitted later by the InstrProfiling pass, which emits it only + // when the TU has profile data, avoiding dangling section references. unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); - auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS); - auto getOrDeclare = [&](StringRef SymName) { - if (auto *GV = M.getNamedGlobal(SymName)) - return GV; - auto *GV = new llvm::GlobalVariable( - M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false, - llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName, - /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, - GlobalAS); - GV->setVisibility(llvm::GlobalValue::HiddenVisibility); - return GV; - }; - auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version"); - if (!VersionGV) { - VersionGV = new llvm::GlobalVariable( - M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true, - llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, - "__llvm_profile_raw_version", + std::string NamesVarPostfixVarName = + std::string(llvm::getInstrProfNamesVarPostfixVarName()); + if (!M.getNamedValue(NamesVarPostfixVarName)) { + auto *NamesVarPostfix = llvm::ConstantDataArray::getString( + Ctx, (llvm::Twine("_") + CUIDHash).str(), true); + auto *NamesGV = new llvm::GlobalVariable( + M, NamesVarPostfix->getType(), /*isConstant=*/true, + llvm::GlobalValue::PrivateLinkage, NamesVarPostfix, + NamesVarPostfixVarName, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, GlobalAS); + CGM.addCompilerUsedGlobal(NamesGV); } - - auto *StructTy = llvm::StructType::get( - Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); - llvm::Constant *Fields[] = { - getOrDeclare("__start___llvm_prf_names"), - getOrDeclare("__stop___llvm_prf_names"), - getOrDeclare("__start___llvm_prf_cnts"), - getOrDeclare("__stop___llvm_prf_cnts"), - getOrDeclare("__start___llvm_prf_data"), - getOrDeclare("__stop___llvm_prf_data"), - VersionGV, - }; - auto *Init = llvm::ConstantStruct::get(StructTy, Fields); - auto *GV = new llvm::GlobalVariable( - M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, - Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, - GlobalAS); - GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); - CGM.addCompilerUsedGlobal(GV); return; } @@ -1407,6 +1421,25 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() { M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, llvm::ConstantPointerNull::get(PtrTy), Name); CGM.addCompilerUsedGlobal(OffloadProfShadow); + + auto AddSectionShadow = [&](StringRef Kind, StringRef DeviceName) { + std::string ShadowName = + (Twine("__llvm_profile_shadow_") + Kind + "_" + CUIDHash + "_" + + Twine(OffloadProfSectionShadows.size())) + .str(); + auto *Shadow = new llvm::GlobalVariable( + M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), ShadowName); + CGM.addCompilerUsedGlobal(Shadow); + OffloadProfSectionShadows.push_back({Shadow, DeviceName.str()}); + }; + + for (auto &&I : EmittedKernels) { + std::string KernelName = getDeviceSideName(cast<NamedDecl>(I.D)); + AddSectionShadow("data", std::string("__profd_") + KernelName); + AddSectionShadow("cnts", std::string("__profc_") + KernelName); + AddSectionShadow("names", getOffloadProfilingNamesVarName(CUIDHash)); + } } // Returns module constructor to be added. diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index d3c94c8addffa..65a15f9ed16e1 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -907,8 +907,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, // self-contained superset of clang_rt.profile, emitted first so the base // archive stays inert. if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) && - getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) { CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); + // Force-retain the constructor-only hipModuleLoad* interceptor object; its + // constructor self-skips when the program does not use hipModuleLoad. + CmdArgs.push_back("-u"); + CmdArgs.push_back("__llvm_profile_offload_register_dynamic_module"); + } } void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs, diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 8141f9f132421..f7c055eeea405 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -597,8 +597,14 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, // self-contained superset of clang_rt.profile, emitted first so the base // archive stays inert (avoiding a /MD-vs-/MT CRT mix in the host image). if (needsProfileRT(Args) && - getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) { CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); + // Force the linker to retain the constructor-only hipModuleLoad* + // interceptor object from clang_rt.profile_rocm (see Linux.cpp). The + // constructor self-skips for programs that do not use hipModuleLoad. + CmdArgs.push_back( + "-include:__llvm_profile_offload_register_dynamic_module"); + } } } diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip b/clang/test/CodeGenHIP/offload-pgo-sections.hip index 17c6fe7b9e609..d21ba30012212 100644 --- a/clang/test/CodeGenHIP/offload-pgo-sections.hip +++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip @@ -1,18 +1,26 @@ // REQUIRES: amdgpu-registered-target // REQUIRES: x86-registered-target -// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global -// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct -// in addrspace(1). Host compile: void* shadow registered with the HIP -// runtime and with the profile runtime's drain list. +// Verify the per-TU __llvm_profile_sections_<CUID> global for HIP+PGO. +// Device side: clang emits the names-postfix marker, and the InstrProfiling +// pass emits the populated 7-pointer struct in addrspace(1) -- but only when +// the TU actually has profile data records. Host compile: void* shadow +// registered with the HIP runtime and the profile runtime's drain list. +// The device struct is emitted by the InstrProfiling pass (not clang codegen), +// so run the pass to observe it. // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ // RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | opt -passes=instrprof -S \ // RUN: | FileCheck -check-prefix=DEV %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \ // RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefix=HOST %s +// +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fgpu-rdc --offload-new-driver \ +// RUN: -cuid=abc -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=HOST-RDC %s // Guard: no PGO -> no emission. // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ @@ -24,11 +32,25 @@ // RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefix=NONE %s +// Guard: PGO on but no instrumented device functions (all device code is +// constexpr/host-only) -> the pass must not emit the sections struct, so its +// section references don't dangle at the device link. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip \ +// RUN: -DEMPTY_DEVICE %s \ +// RUN: | opt -passes=instrprof -S \ +// RUN: | FileCheck -check-prefix=EMPTY %s + #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) +#ifdef EMPTY_DEVICE +// No __global__/instrumented device function: device code folds away. +__device__ constexpr int dead(int x) { return x + 1; } +#else __device__ int helper(int x) { return x + 1; } __global__ void kernel(int *p) { *p = helper(*p); } +#endif // DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8 // DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8 @@ -37,14 +59,34 @@ __global__ void kernel(int *p) { *p = helper(*p); } // DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8 // DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8 // DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64 -// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version -// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] +// DEV-DAG: @__llvm_prf_nm_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}section "__llvm_prf_names" +// DEV-DAG: @__llvm_profile_sections_[[CUID]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version +// DEV-DAG: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] // HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST-DAG: @__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}} = global ptr null // HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] // HOST: define internal void @__hip_register_globals // HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]], // HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) +// HOST: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}}) + +// HOST-RDC: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST-RDC-DAG: @__llvm_profile_shadow_data_[[CUID]]_0 = global ptr null +// HOST-RDC-DAG: @__llvm_profile_shadow_cnts_[[CUID]]_1 = global ptr null +// HOST-RDC-DAG: @__llvm_profile_shadow_names_[[CUID]]_2 = global ptr null +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_sections_[[CUID]] +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_data_[[CUID]]_0 +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_cnts_[[CUID]]_1 +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_names_[[CUID]]_2 +// HOST-RDC: define internal void @__llvm_profile_register_shadow.[[CUID]]() +// HOST-RDC: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) +// HOST-RDC-DAG: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_data_[[CUID]]_0) +// HOST-RDC-DAG: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_cnts_[[CUID]]_1) +// HOST-RDC-DAG: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_names_[[CUID]]_2) // NONE-NOT: __llvm_profile_sections_ // NONE-NOT: __llvm_profile_offload_register_shadow_variable + +// EMPTY-NOT: @__llvm_profile_sections_ +// EMPTY-NOT: @__start___llvm_prf_data diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip b/clang/test/Driver/hip-profile-rocm-runtime.hip index 5e99d3f4f2fee..fc82db4fc13c0 100644 --- a/clang/test/Driver/hip-profile-rocm-runtime.hip +++ b/clang/test/Driver/hip-profile-rocm-runtime.hip @@ -15,6 +15,8 @@ // RUN: | FileCheck -check-prefix=HIP-PGO %s // profile_rocm must precede the base profile so the base archive stays inert. // HIP-PGO: "{{.*}}libclang_rt.profile_rocm.a" +// The constructor-only hipModuleLoad* interceptor object is force-linked. +// HIP-PGO: "-u" "__llvm_profile_offload_register_dynamic_module" // HIP-PGO: "{{.*}}libclang_rt.profile.a" // Without profiling, the ROCm device-profile runtime is not linked. diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 9ea5a2638fac9..98a524392e54a 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1383,7 +1383,7 @@ int __llvm_write_custom_profile(const char *Target, TargetFilename[FilenameLength + 1 + TargetLength] = 0; /* Open and truncate target-specific PGO file */ - FILE *OutputFile = fopen(TargetFilename, "w"); + FILE *OutputFile = fopen(TargetFilename, "wb"); setProfileFile(OutputFile); if (!OutputFile) { @@ -1404,11 +1404,10 @@ int __llvm_write_custom_profile(const char *Target, if (VersionOverride) Version = *VersionOverride; - /* Write custom data to the file */ - ReturnValue = - lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin, - CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL, - NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version); + ReturnValue = lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, + CountersBegin, CountersEnd, NULL, NULL, + lprofGetVPDataReader(), NamesBegin, NamesEnd, + NULL, NULL, NULL, NULL, 0, Version); closeFileObject(OutputFile); // Restore SIGKILL. diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp index ee00c572e3a42..e972f22fdae8f 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp +++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp @@ -57,8 +57,9 @@ static void unlockDynamicModules(void) { } #endif -static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, - const char *Target); +struct OffloadSectionShadowGroup; +static int processDeviceOffloadPrf(void *DeviceOffloadPrf, const char *Target, + const OffloadSectionShadowGroup *Sections); static int isVerboseMode() { static int IsVerbose = -1; @@ -72,6 +73,7 @@ static int isVerboseMode() { /* -------------------------------------------------------------------------- */ typedef int (*hipGetSymbolAddressTy)(void **, const void *); +typedef int (*hipGetSymbolSizeTy)(size_t *, const void *); typedef int (*hipMemcpyTy)(void *, const void *, size_t, int); typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *); typedef int (*hipGetDeviceCountTy)(int *); @@ -88,6 +90,7 @@ typedef struct { typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int); static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr; +static hipGetSymbolSizeTy pHipGetSymbolSize = nullptr; static hipMemcpyTy pHipMemcpy = nullptr; static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr; static hipGetDeviceCountTy pHipGetDeviceCount = nullptr; @@ -113,17 +116,25 @@ static void doEnsureHipLoaded(void) { } #ifdef _WIN32 - static const char HipLibName[] = "amdhip64.dll"; + const char *HipLibName = "amdhip64_7.dll"; #else - static const char HipLibName[] = "libamdhip64.so"; + const char *HipLibName = "libamdhip64.so"; #endif void *Handle = __interception::OpenLibrary(HipLibName); +#ifdef _WIN32 + if (!Handle) { + HipLibName = "amdhip64.dll"; + Handle = __interception::OpenLibrary(HipLibName); + } +#endif if (!Handle) return; pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol( Handle, "hipGetSymbolAddress"); + pHipGetSymbolSize = (hipGetSymbolSizeTy)__interception::LookupSymbol( + Handle, "hipGetSymbolSize"); pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(Handle, "hipMemcpy"); pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol( Handle, "hipModuleGetGlobal"); @@ -189,6 +200,11 @@ static int hipGetSymbolAddress(void **devPtr, const void *symbol) { return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1; } +static int hipGetSymbolSize(size_t *size, const void *symbol) { + ensureHipLoaded(); + return pHipGetSymbolSize ? pHipGetSymbolSize(size, symbol) : -1; +} + static int hipMemcpy(void *dest, const void *src, size_t len, int kind /*2=DToH*/) { ensureHipLoaded(); @@ -471,7 +487,7 @@ extern "C" void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) { char TargetWithTU[64]; snprintf(TargetWithTU, sizeof(TargetWithTU), "%s.%d", ArchName, TUIndex); - if (processDeviceOffloadPrf(TU->DeviceVar, TUIndex, TargetWithTU) == 0) + if (processDeviceOffloadPrf(TU->DeviceVar, TargetWithTU, nullptr) == 0) TU->Processed = 1; else PROF_WARN("failed to process profile data for module %p TU %d\n", Ptr, @@ -505,23 +521,93 @@ static void **OffloadShadowVariables = nullptr; static int NumShadowVariables = 0; static int CapShadowVariables = 0; +struct OffloadSectionShadow { + void *Data; + void *Counters; + void *Names; +}; + +struct OffloadSectionShadowGroup { + OffloadSectionShadow *Shadows; + int NumShadows; + int CapShadows; + int NumSections; +}; + +static OffloadSectionShadowGroup *OffloadSectionShadowGroups = nullptr; +static int CapSectionShadowGroups = 0; + +static int ensureSectionShadowGroupCapacity(void) { + if (CapSectionShadowGroups >= CapShadowVariables) + return 0; + OffloadSectionShadowGroup *New = (OffloadSectionShadowGroup *)realloc( + OffloadSectionShadowGroups, CapShadowVariables * sizeof(*New)); + if (!New) + return -1; + __builtin_memset(New + CapSectionShadowGroups, 0, + (CapShadowVariables - CapSectionShadowGroups) * + sizeof(*New)); + OffloadSectionShadowGroups = New; + CapSectionShadowGroups = CapShadowVariables; + return 0; +} + +static int ensureSectionShadowCapacity(OffloadSectionShadowGroup *Group, + int MinCapacity) { + if (Group->CapShadows >= MinCapacity) + return 0; + int NewCap = Group->CapShadows ? Group->CapShadows * 2 : 4; + while (NewCap < MinCapacity) + NewCap *= 2; + OffloadSectionShadow *New = + (OffloadSectionShadow *)realloc(Group->Shadows, NewCap * sizeof(*New)); + if (!New) + return -1; + __builtin_memset(New + Group->CapShadows, 0, + (NewCap - Group->CapShadows) * sizeof(*New)); + Group->Shadows = New; + Group->CapShadows = NewCap; + return 0; +} + extern "C" void __llvm_profile_offload_register_shadow_variable(void *ptr) { if (growPtrArray(&OffloadShadowVariables, &NumShadowVariables, &CapShadowVariables, 64)) return; - OffloadShadowVariables[NumShadowVariables++] = ptr; + if (ensureSectionShadowGroupCapacity()) + return; + int Index = NumShadowVariables++; + OffloadShadowVariables[Index] = ptr; + __builtin_memset(&OffloadSectionShadowGroups[Index], 0, + sizeof(OffloadSectionShadowGroups[Index])); } -static void **OffloadSectionShadowVariables = nullptr; -static int NumSectionShadowVariables = 0; -static int CapSectionShadowVariables = 0; - extern "C" void __llvm_profile_offload_register_section_shadow_variable(void *ptr) { - if (growPtrArray(&OffloadSectionShadowVariables, &NumSectionShadowVariables, - &CapSectionShadowVariables, 64)) + if (NumShadowVariables == 0) return; - OffloadSectionShadowVariables[NumSectionShadowVariables++] = ptr; + + OffloadSectionShadowGroup *Group = + &OffloadSectionShadowGroups[NumShadowVariables - 1]; + int ShadowIndex = Group->NumSections / 3; + if (ensureSectionShadowCapacity(Group, ShadowIndex + 1)) + return; + if (ShadowIndex >= Group->NumShadows) + Group->NumShadows = ShadowIndex + 1; + + OffloadSectionShadow *Shadow = &Group->Shadows[ShadowIndex]; + switch (Group->NumSections % 3) { + case 0: + Shadow->Data = ptr; + break; + case 1: + Shadow->Counters = ptr; + break; + case 2: + Shadow->Names = ptr; + break; + } + ++Group->NumSections; } namespace { @@ -547,8 +633,41 @@ struct UniqueFree { } // namespace -static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, - const char *Target) { +static int getRegisteredSectionBounds(void *Shadow, void **DevicePtr, + size_t *Size) { + *DevicePtr = nullptr; + *Size = 0; + int AddrRc = hipGetSymbolAddress(DevicePtr, Shadow); + int SizeRc = hipGetSymbolSize(Size, Shadow); + return AddrRc == 0 && SizeRc == 0 && *DevicePtr && *Size > 0 ? 0 : -1; +} + +struct RegisteredSectionRange { + const void *Data; + const void *Counters; + const void *Names; + size_t DataSize; + size_t CountersSize; + size_t NamesSize; + size_t DataOffset; + size_t CountersOffset; + size_t NamesOffset; +}; + +static int +hasCompleteSectionShadows(const OffloadSectionShadowGroup *Sections) { + if (!Sections || Sections->NumShadows == 0 || Sections->NumSections % 3 != 0) + return 0; + for (int I = 0; I < Sections->NumShadows; ++I) { + if (!Sections->Shadows[I].Data || !Sections->Shadows[I].Counters || + !Sections->Shadows[I].Names) + return 0; + } + return 1; +} + +static int processDeviceOffloadPrf(void *DeviceOffloadPrf, const char *Target, + const OffloadSectionShadowGroup *Sections) { __llvm_profile_gpu_sections HostSections; if (hipMemcpy(&HostSections, DeviceOffloadPrf, sizeof(HostSections), @@ -568,6 +687,10 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, size_t DataSize = (const char *)DevDataEnd - (const char *)DevDataBegin; size_t NamesSize = (const char *)DevNamesEnd - (const char *)DevNamesBegin; + int UseRegisteredSections = hasCompleteSectionShadows(Sections); + RegisteredSectionRange *RegisteredRanges = nullptr; + int NumRegisteredRanges = 0; + if (isVerboseMode()) PROF_NOTE("Section pointers: Cnts=[%p,%p]=%zu Data=[%p,%p]=%zu " "Names=[%p,%p]=%zu\n", @@ -599,78 +722,180 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, static size_t CachedDataSize = 0; // Owns freshly malloc'd buffers; release() transfers ownership to the cache. - UniqueFree CntsOwner, DataOwner, NamesOwner; + UniqueFree CntsOwner, DataOwner, NamesOwner, RegisteredRangeOwner; + + if (UseRegisteredSections) { + NumRegisteredRanges = Sections->NumShadows; + RegisteredRangeOwner.reset( + malloc(NumRegisteredRanges * sizeof(RegisteredSectionRange))); + RegisteredRanges = (RegisteredSectionRange *)RegisteredRangeOwner.get(); + if (!RegisteredRanges) { + PROF_ERR("%s\n", "failed to allocate registered section table"); + return -1; + } + __builtin_memset(RegisteredRanges, 0, + NumRegisteredRanges * sizeof(*RegisteredRanges)); + + size_t RegisteredDataSize = 0; + size_t RegisteredCountersSize = 0; + size_t RegisteredNamesSize = 0; + for (int I = 0; I < NumRegisteredRanges; ++I) { + void *Data = nullptr; + void *Counters = nullptr; + void *Names = nullptr; + size_t ThisDataSize = 0; + size_t ThisCountersSize = 0; + size_t ThisNamesSize = 0; + OffloadSectionShadow *Shadow = &Sections->Shadows[I]; + if (getRegisteredSectionBounds(Shadow->Data, &Data, &ThisDataSize) != 0 || + getRegisteredSectionBounds(Shadow->Counters, &Counters, + &ThisCountersSize) != 0 || + getRegisteredSectionBounds(Shadow->Names, &Names, &ThisNamesSize) != + 0) { + PROF_ERR("%s\n", "failed to get registered section bounds"); + return -1; + } - if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin && - CountersSize == CachedCntsSize) { - HostCountersBegin = CachedHostCnts; - CntsReused = 1; - if (isVerboseMode()) - PROF_NOTE("Reusing cached counters section (%zu bytes)\n", CountersSize); - } else if (CountersSize > 0) { - HostCountersBegin = (char *)malloc(CountersSize); - CntsOwner.reset(HostCountersBegin); - } + RegisteredRanges[I].Data = Data; + RegisteredRanges[I].Counters = Counters; + RegisteredRanges[I].Names = Names; + RegisteredRanges[I].DataSize = ThisDataSize; + RegisteredRanges[I].CountersSize = ThisCountersSize; + RegisteredRanges[I].NamesSize = ThisNamesSize; + RegisteredRanges[I].DataOffset = RegisteredDataSize; + RegisteredRanges[I].CountersOffset = RegisteredCountersSize; + RegisteredDataSize += ThisDataSize; + RegisteredCountersSize += ThisCountersSize; + + int ReuseNames = 0; + for (int J = 0; J < I; ++J) { + if (RegisteredRanges[J].Names == Names && + RegisteredRanges[J].NamesSize == ThisNamesSize) { + RegisteredRanges[I].NamesOffset = RegisteredRanges[J].NamesOffset; + ReuseNames = 1; + break; + } + } + if (!ReuseNames) { + RegisteredRanges[I].NamesOffset = RegisteredNamesSize; + RegisteredNamesSize += ThisNamesSize; + } + } - if (DataSize > 0 && DevDataBegin == CachedDevDataBegin && - DataSize == CachedDataSize) { - HostDataBegin = CachedHostData; - DataReused = 1; - if (isVerboseMode()) - PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize); - } else if (DataSize > 0) { + DataSize = RegisteredDataSize; + CountersSize = RegisteredCountersSize; + NamesSize = RegisteredNamesSize; HostDataBegin = (char *)malloc(DataSize); + HostCountersBegin = (char *)malloc(CountersSize); + HostNamesBegin = NamesSize ? (char *)malloc(NamesSize) : nullptr; DataOwner.reset(HostDataBegin); - } - - if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin && - NamesSize == CachedNamesSize) { - HostNamesBegin = CachedHostNames; - NamesReused = 1; - if (isVerboseMode()) - PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize); - } else if (NamesSize > 0) { - HostNamesBegin = (char *)malloc(NamesSize); + CntsOwner.reset(HostCountersBegin); NamesOwner.reset(HostNamesBegin); - } + if ((DataSize > 0 && !HostDataBegin) || + (CountersSize > 0 && !HostCountersBegin) || + (NamesSize > 0 && !HostNamesBegin)) { + PROF_ERR("%s\n", "failed to allocate host memory for device sections"); + return -1; + } - if ((DataSize > 0 && !HostDataBegin) || - (CountersSize > 0 && !HostCountersBegin) || - (NamesSize > 0 && !HostNamesBegin)) { - PROF_ERR("%s\n", "failed to allocate host memory for device sections"); - return -1; - } + for (int I = 0; I < NumRegisteredRanges; ++I) { + RegisteredSectionRange *R = &RegisteredRanges[I]; + if (memcpyDeviceToHost(HostDataBegin + R->DataOffset, R->Data, + R->DataSize) != 0 || + memcpyDeviceToHost(HostCountersBegin + R->CountersOffset, R->Counters, + R->CountersSize) != 0) { + PROF_ERR("%s\n", "failed to copy profile sections from device"); + return -1; + } - if ((DataSize > 0 && !DataReused && - memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) || - (CountersSize > 0 && !CntsReused && - memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) != - 0) || - (NamesSize > 0 && !NamesReused && - memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) { - PROF_ERR("%s\n", "failed to copy profile sections from device"); - return -1; - } + int CopyNames = 1; + for (int J = 0; J < I; ++J) { + if (RegisteredRanges[J].Names == R->Names && + RegisteredRanges[J].NamesSize == R->NamesSize) { + CopyNames = 0; + break; + } + } + if (CopyNames && R->NamesSize > 0 && + memcpyDeviceToHost(HostNamesBegin + R->NamesOffset, R->Names, + R->NamesSize) != 0) { + PROF_ERR("%s\n", "failed to copy profile sections from device"); + return -1; + } + } + } else { + if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin && + CountersSize == CachedCntsSize) { + HostCountersBegin = CachedHostCnts; + CntsReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached counters section (%zu bytes)\n", + CountersSize); + } else if (CountersSize > 0) { + HostCountersBegin = (char *)malloc(CountersSize); + CntsOwner.reset(HostCountersBegin); + } - /* Cache buffers so RDC-mode multi-shadow drains can reuse them. - * release() prevents the scope guards from freeing what the cache owns. */ - if (!CntsReused && CountersSize > 0) { - CachedDevCntsBegin = DevCntsBegin; - CachedHostCnts = HostCountersBegin; - CachedCntsSize = CountersSize; - CntsOwner.release(); - } - if (!DataReused && DataSize > 0) { - CachedDevDataBegin = DevDataBegin; - CachedHostData = HostDataBegin; - CachedDataSize = DataSize; - DataOwner.release(); - } - if (!NamesReused && NamesSize > 0) { - CachedDevNamesBegin = DevNamesBegin; - CachedHostNames = HostNamesBegin; - CachedNamesSize = NamesSize; - NamesOwner.release(); + if (DataSize > 0 && DevDataBegin == CachedDevDataBegin && + DataSize == CachedDataSize) { + HostDataBegin = CachedHostData; + DataReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize); + } else if (DataSize > 0) { + HostDataBegin = (char *)malloc(DataSize); + DataOwner.reset(HostDataBegin); + } + + if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin && + NamesSize == CachedNamesSize) { + HostNamesBegin = CachedHostNames; + NamesReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize); + } else if (NamesSize > 0) { + HostNamesBegin = (char *)malloc(NamesSize); + NamesOwner.reset(HostNamesBegin); + } + + if ((DataSize > 0 && !HostDataBegin) || + (CountersSize > 0 && !HostCountersBegin) || + (NamesSize > 0 && !HostNamesBegin)) { + PROF_ERR("%s\n", "failed to allocate host memory for device sections"); + return -1; + } + + if ((DataSize > 0 && !DataReused && + memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) || + (CountersSize > 0 && !CntsReused && + memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) != + 0) || + (NamesSize > 0 && !NamesReused && + memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) { + PROF_ERR("%s\n", "failed to copy profile sections from device"); + return -1; + } + + /* Cache buffers so RDC-mode multi-shadow drains can reuse them. + * release() prevents the scope guards from freeing what the cache owns. */ + if (!CntsReused && CountersSize > 0) { + CachedDevCntsBegin = DevCntsBegin; + CachedHostCnts = HostCountersBegin; + CachedCntsSize = CountersSize; + CntsOwner.release(); + } + if (!DataReused && DataSize > 0) { + CachedDevDataBegin = DevDataBegin; + CachedHostData = HostDataBegin; + CachedDataSize = DataSize; + DataOwner.release(); + } + if (!NamesReused && NamesSize > 0) { + CachedDevNamesBegin = DevNamesBegin; + CachedHostNames = HostNamesBegin; + CachedNamesSize = NamesSize; + NamesOwner.release(); + } } if (isVerboseMode()) @@ -721,16 +946,38 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, for (uint64_t i = 0; i < NumData; ++i) { if (RelocatedData[i].CounterPtr) { ptrdiff_t DeviceCounterPtrOffset = (ptrdiff_t)RelocatedData[i].CounterPtr; - const char *DeviceDataStructAddr = - (const char *)DevDataBegin + (i * sizeof(__llvm_profile_data)); + size_t DataRecordOffset = i * sizeof(__llvm_profile_data); + const char *RangeDevDataBegin = (const char *)DevDataBegin; + const char *RangeDevCountersBegin = (const char *)DevCntsBegin; + size_t RangeCountersOffset = 0; + if (UseRegisteredSections) { + int FoundRange = 0; + for (int R = 0; R < NumRegisteredRanges; ++R) { + RegisteredSectionRange *Range = &RegisteredRanges[R]; + if (DataRecordOffset < Range->DataOffset || + DataRecordOffset >= Range->DataOffset + Range->DataSize) + continue; + RangeDevDataBegin = (const char *)Range->Data; + RangeDevCountersBegin = (const char *)Range->Counters; + RangeCountersOffset = Range->CountersOffset; + DataRecordOffset -= Range->DataOffset; + FoundRange = 1; + break; + } + if (!FoundRange) { + PROF_ERR("%s\n", "failed to locate profile data record range"); + return -1; + } + } + const char *DeviceDataStructAddr = RangeDevDataBegin + DataRecordOffset; const char *DeviceCountersAddr = DeviceDataStructAddr + DeviceCounterPtrOffset; ptrdiff_t OffsetIntoCountersSection = - DeviceCountersAddr - (const char *)DevCntsBegin; + DeviceCountersAddr - RangeDevCountersBegin; - ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters + - OffsetIntoCountersSection - - (i * sizeof(__llvm_profile_data)); + ptrdiff_t NewRelativeOffset = + DataSize + PaddingBytesBeforeCounters + RangeCountersOffset + + OffsetIntoCountersSection - (i * sizeof(__llvm_profile_data)); __builtin_memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) + offsetof(__llvm_profile_data, CounterPtr), &NewRelativeOffset, sizeof(NewRelativeOffset)); @@ -743,9 +990,6 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, sizeof(RelocatedData[i].Values)); } - /* Target already encodes TUIndex when needed. */ - (void)TUIndex; - ret = __llvm_write_custom_profile( Target, (__llvm_profile_data *)BufDataBegin, (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin, @@ -761,8 +1005,8 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, return ret; } -static int processShadowVariable(void *ShadowVar, int TUIndex, - const char *Target) { +static int processShadowVariable(int Index, const char *Target) { + void *ShadowVar = OffloadShadowVariables[Index]; void *DeviceSections = nullptr; if (hipGetSymbolAddress(&DeviceSections, ShadowVar) != 0) { PROF_WARN("failed to get symbol address for shadow variable %p\n", @@ -770,7 +1014,12 @@ static int processShadowVariable(void *ShadowVar, int TUIndex, return -1; } /* DeviceSections points at the per-TU sections struct itself. */ - return processDeviceOffloadPrf(DeviceSections, TUIndex, Target); + const OffloadSectionShadowGroup *Sections = nullptr; + if (Index < CapSectionShadowGroups) + Sections = &OffloadSectionShadowGroups[Index]; + if (!hasCompleteSectionShadows(Sections)) + return 0; + return processDeviceOffloadPrf(DeviceSections, Target, Sections); } static int isHipAvailable(void) { @@ -815,7 +1064,7 @@ extern "C" int __llvm_profile_hip_collect_device_data(void) { snprintf(TargetWithIdx, sizeof(TargetWithIdx), "%s.%d", ArchName, i); Target = TargetWithIdx; } - if (processShadowVariable(OffloadShadowVariables[i], i, Target) != 0) + if (processShadowVariable(i, Target) != 0) Ret = -1; } } diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 90471e910bb50..0138e310304fa 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -152,6 +152,10 @@ inline StringRef getInstrProfVNodesVarName() { return "__llvm_prf_vnodes"; } /// of all function's PGO names. inline StringRef getInstrProfNamesVarName() { return "__llvm_prf_nm"; } +inline StringRef getInstrProfNamesVarPostfixVarName() { + return "__llvm_prf_nm_postfix"; +} + inline StringRef getInstrProfVTableNamesVarName() { return "__llvm_prf_vnm"; } /// Return the name of a covarage mapping variable (internal linkage) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 8e4ba41919768..6f6ad89126024 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1984,6 +1984,50 @@ void InstrLowerer::emitVNodes() { UsedVars.push_back(VNodesVar); } +// Build the per-TU device-PGO sections struct: section start/stop bounds for +// names/counters/data plus the raw version. Returns null if it already exists. +static GlobalVariable *emitGPUOffloadSectionsStruct(Module &M, + StringRef CUIDPostfix) { + std::string Name = ("__llvm_profile_sections" + CUIDPostfix).str(); + if (M.getNamedValue(Name)) + return nullptr; + + LLVMContext &Ctx = M.getContext(); + unsigned AS = M.getDataLayout().getDefaultGlobalsAddressSpace(); + auto Extern = [&](StringRef Sym, Type *Ty, bool IsConst, + GlobalValue::VisibilityTypes Vis) { + GlobalVariable *GV = M.getNamedGlobal(Sym); + if (!GV) { + GV = new GlobalVariable(M, Ty, IsConst, GlobalValue::ExternalLinkage, + nullptr, Sym, nullptr, + GlobalValue::NotThreadLocal, AS); + GV->setVisibility(Vis); + } + return GV; + }; + // Section bounds are hidden i8 markers; raw_version is an i64 constant. + auto *I8 = Type::getInt8Ty(Ctx); + auto Hidden = GlobalValue::HiddenVisibility; + Constant *Fields[] = {Extern("__start___llvm_prf_names", I8, false, Hidden), + Extern("__stop___llvm_prf_names", I8, false, Hidden), + Extern("__start___llvm_prf_cnts", I8, false, Hidden), + Extern("__stop___llvm_prf_cnts", I8, false, Hidden), + Extern("__start___llvm_prf_data", I8, false, Hidden), + Extern("__stop___llvm_prf_data", I8, false, Hidden), + Extern("__llvm_profile_raw_version", + Type::getInt64Ty(Ctx), true, + GlobalValue::DefaultVisibility)}; + auto *PtrTy = PointerType::get(Ctx, AS); + auto *STy = + StructType::get(Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); + auto *GV = new GlobalVariable(M, STy, /*isConstant=*/true, + GlobalValue::ExternalLinkage, + ConstantStruct::get(STy, Fields), Name, nullptr, + GlobalValue::NotThreadLocal, AS); + GV->setVisibility(GlobalValue::ProtectedVisibility); + return GV; +} + void InstrLowerer::emitNameData() { if (ReferencedNames.empty()) return; @@ -1998,9 +2042,28 @@ void InstrLowerer::emitNameData() { auto *NamesVal = ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); std::string NamesVarName = std::string(getInstrProfNamesVarName()); - NamesVar = - new GlobalVariable(M, NamesVal->getType(), true, - GlobalValue::PrivateLinkage, NamesVal, NamesVarName); + GlobalValue::LinkageTypes NamesLinkage = GlobalValue::PrivateLinkage; + GlobalValue::VisibilityTypes NamesVisibility = GlobalValue::DefaultVisibility; + std::string GPUCUIDPostfix; + if (isGPUProfTarget(M)) { + if (auto *GV = M.getNamedGlobal(getInstrProfNamesVarPostfixVarName())) { + if (auto *Init = + dyn_cast_or_null<ConstantDataArray>(GV->getInitializer())) { + if (Init->isCString()) { + GPUCUIDPostfix = Init->getAsCString().str(); + NamesVarName += GPUCUIDPostfix; + NamesLinkage = GlobalValue::ExternalLinkage; + NamesVisibility = GlobalValue::ProtectedVisibility; + removeFromUsedLists( + M, [GV](Constant *C) { return C->stripPointerCasts() == GV; }); + GV->eraseFromParent(); + } + } + } + } + NamesVar = new GlobalVariable(M, NamesVal->getType(), true, NamesLinkage, + NamesVal, NamesVarName); + NamesVar->setVisibility(NamesVisibility); NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); @@ -2019,6 +2082,14 @@ void InstrLowerer::emitNameData() { for (auto *NamePtr : ReferencedNames) NamePtr->eraseFromParent(); + + // Emit the device sections struct only when this TU produced profile data, so + // its section start/stop references are backed by a real section. + bool HasData = llvm::any_of(ProfileDataMap, + [](const auto &KV) { return KV.second.DataVar; }); + if (!GPUCUIDPostfix.empty() && HasData) + if (GlobalVariable *GV = emitGPUOffloadSectionsStruct(M, GPUCUIDPostfix)) + CompilerUsedVars.push_back(GV); } void InstrLowerer::emitVTableNames() { _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
