https://github.com/yxsamliu created https://github.com/llvm/llvm-project/pull/201499
Reland of #177665, reverted in #201416 along with its merged follow-ups #200101, #200127, #200111, and #200859. The revert was triggered because the merged change still broke Windows profiling builds that link `clang_rt.profile` with the static CRT (`/MT`): the ROCm collection runtime pulls in `RTInterception` + `sanitizer_common`, which are built `/MD`, forcing the profile archive to `/MD` and breaking static-CRT consumers (`LNK2019 __imp_getpid` …). This reland restores the reverted commits and fixes that last regression: `COMPILER_RT_BUILD_PROFILE_ROCM` now defaults off on Windows (already off on Apple), with the interceptor object-lib merge and the `Profile-*` test `/MD` CRT model gated on it. The Windows archive stays `/MT`, so static-CRT consumers link again. Windows device PGO is not wired up end-to-end yet, so nothing is lost; opt in with `-DCOMPILER_RT_BUILD_PROFILE_ROCM=ON` to get the `/MD` archive. >From a7249a2b2fe192f3fe3ea767ab1f7a665df047fd Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Wed, 3 Jun 2026 22:18:59 -0400 Subject: [PATCH 1/2] Revert "Revert "[PGO][AMDGPU] Add basic HIP offload PGO support (#177665)" (#201416)" This reverts commit 6cfa1a01a0f737ed1d54963810057fb1bd67a274. --- clang/lib/CodeGen/CGCUDANV.cpp | 152 +++ .../test/CodeGenHIP/offload-pgo-sections.hip | 50 + compiler-rt/CMakeLists.txt | 9 + compiler-rt/lib/profile/CMakeLists.txt | 59 +- compiler-rt/lib/profile/InstrProfilingFile.c | 30 + .../profile/InstrProfilingPlatformROCm.cpp | 897 ++++++++++++++++++ llvm/include/llvm/IR/RuntimeLibcalls.td | 9 +- llvm/include/llvm/ProfileData/InstrProf.h | 5 + .../Instrumentation/InstrProfiling.cpp | 109 ++- .../Instrumentation/PGOInstrumentation.cpp | 3 +- .../InstrProfiling/amdgpu-instrumentation.ll | 32 + .../InstrProfiling/amdgpu-profc-arrays.ll | 26 + .../InstrProfiling/gpu-weak.ll | 36 + .../amdgpu-disable-value-profiling.ll | 22 + 14 files changed, 1407 insertions(+), 32 deletions(-) create mode 100644 clang/test/CodeGenHIP/offload-pgo-sections.hip create mode 100644 compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll create mode 100644 llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll create mode 100644 llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 259b6c040706b..65f398af7902b 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/Format.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" using namespace clang; using namespace CodeGen; @@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() llvm::GlobalVariable *GpuBinaryHandle = nullptr; + /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global, + /// emitted only for HIP host compiles when PGO is on. Registered via + /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime + /// can locate the device-side table by name. + llvm::GlobalVariable *OffloadProfShadow = nullptr; /// Whether we generate relocatable device code. bool RelocatableDeviceCode; /// Mangle context for device. @@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime { void transformManagedVars(); /// Create offloading entries to register globals in RDC mode. void createOffloadingEntries(); + /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global. + /// On the device side it is the populated 7-pointer section-bounds table. + /// On the host side it is a placeholder void* shadow stored in + /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC) + /// or createOffloadingEntries (RDC) so the runtime can locate the + /// device-side table by name. + void emitOffloadProfilingSections(); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { } } + // Register the per-TU offload-profiling shadow so the host runtime can + // locate the matching device-side __llvm_profile_sections_<CUID>. We + // emit both __hipRegisterVar (so the HIP runtime can map the host + // shadow to the device symbol) and + // __llvm_profile_offload_register_shadow_variable (so the profile + // runtime adds the shadow to its drain list). + if (OffloadProfShadow) { + llvm::Constant *Name = + makeConstantString(std::string(OffloadProfShadow->getName())); + llvm::Value *RegisterVarArgs[] = { + &GpuBinaryHandlePtr, + OffloadProfShadow, + Name, + Name, + llvm::ConstantInt::get(IntTy, /*Extern=*/0), + llvm::ConstantInt::get(VarSizeTy, CGM.getDataLayout().getPointerSize()), + llvm::ConstantInt::get(IntTy, /*Constant=*/0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, RegisterVarArgs); + + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + Builder.CreateCall(RegisterShadow, {OffloadProfShadow}); + } + Builder.CreateRetVoid(); return RegisterKernelsFunc; } @@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() { I.Flags.getSurfTexType()); } } + + // Register the per-TU offload-profiling shadow. The offloading entry + // makes the linker-wrapper emit the host __hipRegisterVar call in the + // combined ctor. Separately emit a per-TU ctor that registers the + // shadow with the profile runtime's drain list. + if (OffloadProfShadow) { + llvm::offloading::emitOffloadingEntry( + M, Kind, OffloadProfShadow, OffloadProfShadow->getName(), + CGM.getDataLayout().getPointerSize(), + llvm::offloading::OffloadGlobalEntry, /*Data=*/0); + + llvm::LLVMContext &Ctx = M.getContext(); + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + auto *CtorFn = llvm::Function::Create( + llvm::FunctionType::get(VoidTy, false), + llvm::GlobalValue::InternalLinkage, + "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), &M); + auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn); + llvm::IRBuilder<> B(Entry); + B.CreateCall(RegisterShadow, {OffloadProfShadow}); + B.CreateRetVoid(); + llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535); + } +} + +// For HIP host+device compiles with PGO enabled, emit the per-TU global +// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding +// section start/stop bounds for the names/counters/data sections plus the +// raw-version variable. Host side: an opaque void* shadow whose only +// purpose is to give the host-runtime a registered symbol name to look up +// via hipGetSymbolAddress; the actual device-side data lives in the +// matching device-side global. +void CGNVCUDARuntime::emitOffloadProfilingSections() { + if (!CGM.getLangOpts().HIP) + return; + if (!CGM.getCodeGenOpts().hasProfileInstr()) + return; + + StringRef CUIDHash = CGM.getContext().getCUIDHash(); + if (CUIDHash.empty()) + return; + + llvm::Module &M = CGM.getModule(); + llvm::LLVMContext &Ctx = M.getContext(); + std::string Name = ("__llvm_profile_sections_" + CUIDHash).str(); + + // If the global already exists (e.g. another TU was merged in), don't + // duplicate it. + if (M.getNamedValue(Name)) + return; + + if (CGM.getLangOpts().CUDAIsDevice) { + // Device side: emit the populated struct. Section start/stop symbols + // are linker-defined (ELF auto-generates __start_/__stop_ for any + // section whose name is a valid C identifier; AMDGPU is ELF). + unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); + auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS); + auto getOrDeclare = [&](StringRef SymName) { + if (auto *GV = M.getNamedGlobal(SymName)) + return GV; + auto *GV = new llvm::GlobalVariable( + M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName, + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::HiddenVisibility); + return GV; + }; + auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version"); + if (!VersionGV) { + VersionGV = new llvm::GlobalVariable( + M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, + "__llvm_profile_raw_version", + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + } + + auto *StructTy = llvm::StructType::get( + Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); + llvm::Constant *Fields[] = { + getOrDeclare("__start___llvm_prf_names"), + getOrDeclare("__stop___llvm_prf_names"), + getOrDeclare("__start___llvm_prf_cnts"), + getOrDeclare("__stop___llvm_prf_cnts"), + getOrDeclare("__start___llvm_prf_data"), + getOrDeclare("__stop___llvm_prf_data"), + VersionGV, + }; + auto *Init = llvm::ConstantStruct::get(StructTy, Fields); + auto *GV = new llvm::GlobalVariable( + M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, + Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); + CGM.addCompilerUsedGlobal(GV); + return; + } + + // Host side: emit an opaque void* shadow. Layout doesn't matter — the + // runtime locates it by name via hipGetSymbolAddress and treats it as + // the address of the device-side struct. Registration with the HIP + // runtime is added by makeRegisterGlobalsFn (non-RDC) or + // createOffloadingEntries (RDC). + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + OffloadProfShadow = new llvm::GlobalVariable( + M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), Name); + CGM.addCompilerUsedGlobal(OffloadProfShadow); } // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { transformManagedVars(); + emitOffloadProfilingSections(); if (CGM.getLangOpts().CUDAIsDevice) { // Mark ODR-used device variables as compiler used to prevent it from being // eliminated by optimization. This is necessary for device variables diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip b/clang/test/CodeGenHIP/offload-pgo-sections.hip new file mode 100644 index 0000000000000..17c6fe7b9e609 --- /dev/null +++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip @@ -0,0 +1,50 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: x86-registered-target + +// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global +// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct +// in addrspace(1). Host compile: void* shadow registered with the HIP +// runtime and with the profile runtime's drain list. + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=DEV %s + +// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=HOST %s + +// Guard: no PGO -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +// Guard: no CUID -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ int helper(int x) { return x + 1; } +__global__ void kernel(int *p) { *p = helper(*p); } + +// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64 +// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version +// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] + +// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] +// HOST: define internal void @__hip_register_globals +// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]], +// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) + +// NONE-NOT: __llvm_profile_sections_ +// NONE-NOT: __llvm_profile_offload_register_shadow_variable diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index e88321d822f84..39034fd9ba67d 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -322,6 +322,15 @@ option(COMPILER_RT_USE_ATOMIC_LIBRARY "Use compiler-rt atomic instead of libatom option(COMPILER_RT_PROFILE_BAREMETAL "Build minimal baremetal profile library" OFF) +set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM ON) +if(APPLE) + set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM OFF) +endif() +option(COMPILER_RT_BUILD_PROFILE_ROCM + "Build the host-side ROCm/HIP device profile collection runtime" + ${DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM}) +mark_as_advanced(COMPILER_RT_BUILD_PROFILE_ROCM) + include(config-ix) #================================ diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 8d9a773412a22..77db2477bb7c6 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -93,6 +93,9 @@ if (NOT COMPILER_RT_PROFILE_BAREMETAL) InstrProfilingUtil.c InstrProfilingValue.c ) + if(COMPILER_RT_BUILD_PROFILE_ROCM) + list(APPEND PROFILE_SOURCES InstrProfilingPlatformROCm.cpp) + endif() endif() set(PROFILE_HEADERS @@ -155,6 +158,43 @@ if(COMPILER_RT_PROFILE_BAREMETAL) -DCOMPILER_RT_PROFILE_BAREMETAL=1) endif() +# The HIP host interceptor in InstrProfilingPlatformROCm.cpp pulls in +# RTInterception + sanitizer_common object libs. Those targets are only created +# when COMPILER_RT_BUILD_SANITIZERS / _MEMPROF / _XRAY / _CTX_PROFILE is enabled +# (see lib/CMakeLists.txt). In a profile-only build the targets do not exist; +# skip both the object-lib merge and the ROCm source file so the static archive +# remains self-contained. +set(PROFILE_OBJECT_LIBS) +set(PROFILE_HAS_HIP_INTERCEPTOR FALSE) +if(COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL + AND TARGET RTInterception.${COMPILER_RT_DEFAULT_TARGET_ARCH} + AND TARGET RTSanitizerCommon.${COMPILER_RT_DEFAULT_TARGET_ARCH} + AND TARGET RTSanitizerCommonLibc.${COMPILER_RT_DEFAULT_TARGET_ARCH}) + # RTInterception references __sanitizer_internal_{memcpy,memset,memmove} and other + # sanitizer_common symbols; merge the same object libs as clang_rt.cfi (without + # coverage/symbolizer) so -fprofile-instr-generate links stay self-contained. + list(APPEND PROFILE_OBJECT_LIBS + RTInterception + RTSanitizerCommon + RTSanitizerCommonLibc) + set(PROFILE_HAS_HIP_INTERCEPTOR TRUE) +endif() + +if(NOT PROFILE_HAS_HIP_INTERCEPTOR) + list(REMOVE_ITEM PROFILE_SOURCES InstrProfilingPlatformROCm.cpp) +endif() + +# Only advertise the ROCm interceptor to InstrProfilingFile.c when its +# definition (InstrProfilingPlatformROCm.cpp) is actually compiled into the +# archive. Otherwise InstrProfilingFile.c references +# __llvm_profile_hip_collect_device_data with no definition; on COFF/Windows +# there is no weak-undefined fallback, so the link fails (see PR #200111). +if(COMPILER_RT_BUILD_PROFILE_ROCM AND PROFILE_HAS_HIP_INTERCEPTOR) + set(EXTRA_FLAGS + ${EXTRA_FLAGS} + -DCOMPILER_RT_BUILD_PROFILE_ROCM=1) +endif() + if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx") append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding EXTRA_FLAGS) append_list_if(COMPILER_RT_HAS_NOGPULIB_FLAG -nogpulib EXTRA_FLAGS) @@ -168,13 +208,24 @@ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx") endif() if(MSVC) - # profile historically has only been supported with the static runtime - # on windows - set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) + # profile historically used the static CRT (/MT). When we merge RTInterception and + # RTSanitizerCommon (same object libs as clang_rt.cfi on ELF), those targets are + # built with MultiThreadedDLL (/MD) — see interception/CMakeLists.txt and + # sanitizer_common/CMakeLists.txt. Mixing /MD objects into a /MT libclang_rt.profile + # yields LNK2019 (__imp__stricmp from interception_win.cpp) and LNK4098 in Profile-*. + if(PROFILE_HAS_HIP_INTERCEPTOR) + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL) + else() + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) + endif() endif() # We don't use the C++ Standard Library here, so avoid including it by mistake. append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS) +# C++ profile sources (e.g. InstrProfilingPlatformROCm.cpp) must not emit exception +# personality symbols: host libclang_rt.profile.a is linked from C code and from C++ +# tests that do not pull in __gxx_personality_v0 (Profile-* / premerge). +append_list_if(COMPILER_RT_HAS_FNO_EXCEPTIONS_FLAG -fno-exceptions EXTRA_FLAGS) # XRay uses C++ standard library headers. string(REGEX REPLACE "-?-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") @@ -200,6 +251,7 @@ if(APPLE) STATIC OS ${PROFILE_SUPPORTED_OS} ARCHS ${PROFILE_SUPPORTED_ARCH} + OBJECT_LIBS ${PROFILE_OBJECT_LIBS} CFLAGS ${EXTRA_FLAGS} SOURCES ${PROFILE_SOURCES} ADDITIONAL_HEADERS ${PROFILE_HEADERS} @@ -209,6 +261,7 @@ else() add_compiler_rt_runtime(clang_rt.profile STATIC ARCHS ${PROFILE_SUPPORTED_ARCH} + OBJECT_LIBS ${PROFILE_OBJECT_LIBS} CFLAGS ${EXTRA_FLAGS} SOURCES ${PROFILE_SOURCES} ADDITIONAL_HEADERS ${PROFILE_HEADERS} diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 71127b05aafb8..9ea5a2638fac9 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -41,6 +41,23 @@ #include "InstrProfilingPort.h" #include "InstrProfilingUtil.h" +/* Weak so non-HIP programs do not force InstrProfilingPlatformROCm.o (and its + * transitive sanitizer_common / interception dependencies) into the host link + * out of libclang_rt.profile.a. HIP programs emit strong references to other + * ROCm-runtime symbols (e.g. __llvm_profile_offload_register_shadow_variable) + * that pull in the strong definition. + * No COMPILER_RT_VISIBILITY: a hidden weak-undefined symbol is non-preemptible + * and the address test at the call site would fold to true. + * Windows: __declspec(selectany) is data-only, and the ROCm interceptor path + * is not used there, so keep the original strong extern. */ +#if COMPILER_RT_BUILD_PROFILE_ROCM +#if defined(_WIN32) +extern int __llvm_profile_hip_collect_device_data(void); +#else +__attribute__((weak)) int __llvm_profile_hip_collect_device_data(void); +#endif +#endif + /* From where is profile name specified. * The order the enumerators define their * precedence. Re-order them may lead to @@ -1198,6 +1215,19 @@ int __llvm_profile_write_file(void) { if (rc) PROF_ERR("Failed to write file \"%s\": %s\n", Filename, strerror(errno)); + /* On non-Windows the declaration is weak: only invoked when + * InstrProfilingPlatformROCm.o is in the link, which happens when the program + * references other ROCm-runtime symbols (HIP-with-PGO). Warning on failure is + * handled inside the callee. */ +#if COMPILER_RT_BUILD_PROFILE_ROCM +#if defined(_WIN32) + (void)__llvm_profile_hip_collect_device_data(); +#else + if (&__llvm_profile_hip_collect_device_data) + (void)__llvm_profile_hip_collect_device_data(); +#endif +#endif + // Restore SIGKILL. if (PDeathSig == 1) lprofRestoreSigKill(); diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp new file mode 100644 index 0000000000000..ee00c572e3a42 --- /dev/null +++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp @@ -0,0 +1,897 @@ +//===- InstrProfilingPlatformROCm.cpp - Profile data ROCm platform -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +extern "C" { +#include "InstrProfiling.h" +#include "InstrProfilingInternal.h" +#include "InstrProfilingPort.h" +} + +#include "interception/interception.h" +// C library headers (not <cstdio> etc.): clang_rt.profile is built with +// -nostdinc++ and avoids the C++ standard library (see profile/CMakeLists.txt). +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#else +#include <dlfcn.h> +#include <pthread.h> +#endif + +/* Serialize one-time HIP loader resolution and DynamicModules mutations. + * Inline to avoid a sanitizer_common dependency. */ +#ifdef _WIN32 +static INIT_ONCE HipLoadedOnce = INIT_ONCE_STATIC_INIT; +static CRITICAL_SECTION DynamicModulesLock; +static INIT_ONCE DynamicModulesLockInit = INIT_ONCE_STATIC_INIT; +static BOOL CALLBACK initDynamicModulesLockCb(PINIT_ONCE, PVOID, PVOID *) { + InitializeCriticalSection(&DynamicModulesLock); + return TRUE; +} +static void lockDynamicModules(void) { + InitOnceExecuteOnce(&DynamicModulesLockInit, initDynamicModulesLockCb, NULL, + NULL); + EnterCriticalSection(&DynamicModulesLock); +} +static void unlockDynamicModules(void) { + LeaveCriticalSection(&DynamicModulesLock); +} +#else +static pthread_once_t HipLoadedOnce = PTHREAD_ONCE_INIT; +static pthread_mutex_t DynamicModulesLock = PTHREAD_MUTEX_INITIALIZER; +static void lockDynamicModules(void) { + pthread_mutex_lock(&DynamicModulesLock); +} +static void unlockDynamicModules(void) { + pthread_mutex_unlock(&DynamicModulesLock); +} +#endif + +static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, + const char *Target); + +static int isVerboseMode() { + static int IsVerbose = -1; + if (IsVerbose == -1) + IsVerbose = getenv("LLVM_PROFILE_VERBOSE") != nullptr; + return IsVerbose; +} + +/* -------------------------------------------------------------------------- */ +/* Dynamic loading of HIP runtime symbols */ +/* -------------------------------------------------------------------------- */ + +typedef int (*hipGetSymbolAddressTy)(void **, const void *); +typedef int (*hipMemcpyTy)(void *, const void *, size_t, int); +typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *); +typedef int (*hipGetDeviceCountTy)(int *); +typedef int (*hipGetDeviceTy)(int *); +typedef int (*hipSetDeviceTy)(int); + +/* Minimal hipDeviceProp_t (HIP 6.x R0600): only gcnArchName at offset 1160 + * is read. Padded to 4096 to tolerate ABI growth. */ +typedef struct { + char padding[1160]; + char gcnArchName[256]; + char tail_padding[2680]; +} HipDevicePropMinimal; +typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int); + +static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr; +static hipMemcpyTy pHipMemcpy = nullptr; +static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr; +static hipGetDeviceCountTy pHipGetDeviceCount = nullptr; +static hipGetDeviceTy pHipGetDevice = nullptr; +static hipSetDeviceTy pHipSetDevice = nullptr; +static hipGetDevicePropertiesTy pHipGetDeviceProperties = nullptr; + +static int NumDevices = 0; +/* 256 matches hipDeviceProp_t::gcnArchName, the source field width. */ +static char (*DeviceArchNames)[256] = nullptr; + +/* -------------------------------------------------------------------------- */ +/* Device-to-host copies */ +/* Keep HIP-only to avoid an HSA dependency. */ +/* -------------------------------------------------------------------------- */ + +static void doEnsureHipLoaded(void) { + if (!__interception::DynamicLoaderAvailable()) { + if (isVerboseMode()) + PROF_NOTE("%s", "Dynamic library loading not available - " + "HIP profiling disabled\n"); + return; + } + +#ifdef _WIN32 + static const char HipLibName[] = "amdhip64.dll"; +#else + static const char HipLibName[] = "libamdhip64.so"; +#endif + + void *Handle = __interception::OpenLibrary(HipLibName); + if (!Handle) + return; + + pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol( + Handle, "hipGetSymbolAddress"); + pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(Handle, "hipMemcpy"); + pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol( + Handle, "hipModuleGetGlobal"); + pHipGetDeviceCount = (hipGetDeviceCountTy)__interception::LookupSymbol( + Handle, "hipGetDeviceCount"); + pHipGetDevice = + (hipGetDeviceTy)__interception::LookupSymbol(Handle, "hipGetDevice"); + pHipSetDevice = + (hipSetDeviceTy)__interception::LookupSymbol(Handle, "hipSetDevice"); + pHipGetDeviceProperties = + (hipGetDevicePropertiesTy)__interception::LookupSymbol( + Handle, "hipGetDevicePropertiesR0600"); + if (!pHipGetDeviceProperties) + pHipGetDeviceProperties = + (hipGetDevicePropertiesTy)__interception::LookupSymbol( + Handle, "hipGetDeviceProperties"); + + if (pHipGetDeviceCount && pHipGetDeviceProperties) { + int Count = 0; + if (pHipGetDeviceCount(&Count) == 0 && Count > 0) { + DeviceArchNames = (char (*)[256])calloc(Count, sizeof(*DeviceArchNames)); + if (!DeviceArchNames) { + PROF_ERR("%s\n", "failed to allocate device arch name table"); + return; + } + HipDevicePropMinimal Prop; + for (int i = 0; i < Count; ++i) { + __builtin_memset(&Prop, 0, sizeof(Prop)); + if (pHipGetDeviceProperties(&Prop, i) == 0) { + strncpy(DeviceArchNames[i], Prop.gcnArchName, + sizeof(DeviceArchNames[i]) - 1); + DeviceArchNames[i][sizeof(DeviceArchNames[i]) - 1] = '\0'; + if (isVerboseMode()) + PROF_NOTE("Device %d arch: %s\n", i, DeviceArchNames[i]); + } + } + NumDevices = Count; + } + } +} + +#ifdef _WIN32 +static BOOL CALLBACK ensureHipLoadedCb(PINIT_ONCE, PVOID, PVOID *) { + doEnsureHipLoaded(); + return TRUE; +} +#endif + +static void ensureHipLoaded(void) { +#ifdef _WIN32 + InitOnceExecuteOnce(&HipLoadedOnce, ensureHipLoadedCb, NULL, NULL); +#else + pthread_once(&HipLoadedOnce, doEnsureHipLoaded); +#endif +} + +/* -------------------------------------------------------------------------- */ +/* Public wrappers that forward to the loaded HIP symbols */ +/* -------------------------------------------------------------------------- */ + +static int hipGetSymbolAddress(void **devPtr, const void *symbol) { + ensureHipLoaded(); + return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1; +} + +static int hipMemcpy(void *dest, const void *src, size_t len, + int kind /*2=DToH*/) { + ensureHipLoaded(); + return pHipMemcpy ? pHipMemcpy(dest, src, len, kind) : -1; +} + +/* Device section symbols must be registered with CLR first; otherwise + * hipMemcpy may take a CPU path and crash. */ +static int memcpyDeviceToHost(void *Dst, const void *Src, size_t Size) { + return hipMemcpy(Dst, Src, Size, 2 /* DToH */); +} + +static int hipModuleGetGlobal(void **DevPtr, size_t *Bytes, void *Module, + const char *Name) { + ensureHipLoaded(); + return pHipModuleGetGlobal ? pHipModuleGetGlobal(DevPtr, Bytes, Module, Name) + : -1; +} + +static int hipGetDevice(int *DeviceId) { + ensureHipLoaded(); + return pHipGetDevice ? pHipGetDevice(DeviceId) : -1; +} + +static int hipSetDevice(int DeviceId) { + ensureHipLoaded(); + return pHipSetDevice ? pHipSetDevice(DeviceId) : -1; +} + +static const char *getDeviceArchName(int DeviceId) { + if (DeviceId < 0 || DeviceId >= NumDevices || !DeviceArchNames[DeviceId][0]) + return "amdgpu"; + return DeviceArchNames[DeviceId]; +} + +/* -------------------------------------------------------------------------- */ +/* Dynamic module tracking */ +/* -------------------------------------------------------------------------- */ + +/* Per-TU profile entry inside a dynamic module. + * A single dynamic module may contain multiple TUs (e.g. -fgpu-rdc). */ +typedef struct { + void *DeviceVar; /* device address of __llvm_profile_sections_<CUID> */ + int Processed; /* 0 = not yet collected, 1 = data already copied */ +} OffloadDynamicTUInfo; + +/* One entry per hipModuleLoad call. */ +typedef struct { + void *ModulePtr; /* hipModule_t handle */ + OffloadDynamicTUInfo *TUs; /* array of per-TU entries */ + int NumTUs; + int CapTUs; +} OffloadDynamicModuleInfo; + +static OffloadDynamicModuleInfo *DynamicModules = nullptr; +static int NumDynamicModules = 0; +static int CapDynamicModules = 0; + +/* -------------------------------------------------------------------------- */ +/* ELF symbol enumeration (manual parse: compiler-rt cannot link LLVM Support) + */ +/* -------------------------------------------------------------------------- */ + +#if __has_include(<elf.h>) +#include <elf.h> + +/* Callback invoked for every matching symbol name found in the ELF image. + * Return 0 to continue iteration, non-zero to stop. */ +typedef int (*SymbolCallback)(const char *Name, void *UserData); + +/* If Image is a clang offload bundle, return a pointer to the first embedded + * ELF. Returns Image if not a bundle, nullptr if a bundle holds no ELF. */ +static const void *unwrapOffloadBundle(const void *Image) { + static const char BundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__"; + if (memcmp(Image, BundleMagic, sizeof(BundleMagic) - 1) != 0) + return Image; /* Not a bundle, return as-is. */ + + const char *Buf = (const char *)Image; + uint64_t NumEntries; + __builtin_memcpy(&NumEntries, Buf + sizeof(BundleMagic) - 1, + sizeof(uint64_t)); + + /* Walk the entry table (starts at offset 32). */ + const char *Cursor = Buf + 32; + for (uint64_t I = 0; I < NumEntries; ++I) { + uint64_t EntryOffset, EntrySize, IDSize; + __builtin_memcpy(&EntryOffset, Cursor, sizeof(EntryOffset)); + Cursor += sizeof(EntryOffset); + __builtin_memcpy(&EntrySize, Cursor, sizeof(EntrySize)); + Cursor += sizeof(EntrySize); + __builtin_memcpy(&IDSize, Cursor, sizeof(IDSize)); + Cursor += sizeof(IDSize); + Cursor += IDSize; /* skip entry ID */ + + if (EntrySize >= sizeof(Elf64_Ehdr)) { + const Elf64_Ehdr *E = (const Elf64_Ehdr *)(Buf + EntryOffset); + if (E->e_ident[EI_MAG0] == ELFMAG0 && E->e_ident[EI_MAG1] == ELFMAG1 && + E->e_ident[EI_MAG2] == ELFMAG2 && E->e_ident[EI_MAG3] == ELFMAG3) { + return (const void *)(Buf + EntryOffset); + } + } + } + + PROF_WARN("%s", "offload bundle contains no valid ELF entries\n"); + return nullptr; +} + +/* Invoke CB for every global symbol in Image (an AMDGPU ELF or offload bundle) + * whose name starts with PREFIX. Image may be null. */ +static void enumerateElfSymbols(const void *Image, const char *Prefix, + SymbolCallback CB, void *UserData) { + if (!Image) + return; + + Image = unwrapOffloadBundle(Image); + if (!Image) + return; + + const Elf64_Ehdr *Ehdr = (const Elf64_Ehdr *)Image; + if (Ehdr->e_ident[EI_MAG0] != ELFMAG0 || Ehdr->e_ident[EI_MAG1] != ELFMAG1 || + Ehdr->e_ident[EI_MAG2] != ELFMAG2 || Ehdr->e_ident[EI_MAG3] != ELFMAG3) { + if (isVerboseMode()) + PROF_NOTE("%s", "Image is not a valid ELF, skipping enumeration\n"); + return; + } + + size_t PrefixLen = strlen(Prefix); + const char *Base = (const char *)Image; + const Elf64_Shdr *Shdrs = (const Elf64_Shdr *)(Base + Ehdr->e_shoff); + + for (int i = 0; i < Ehdr->e_shnum; ++i) { + if (Shdrs[i].sh_type != SHT_SYMTAB) + continue; + + const Elf64_Sym *Syms = (const Elf64_Sym *)(Base + Shdrs[i].sh_offset); + int NumSyms = Shdrs[i].sh_size / sizeof(Elf64_Sym); + /* String table is the section referenced by sh_link. */ + const char *StrTab = Base + Shdrs[Shdrs[i].sh_link].sh_offset; + + for (int j = 0; j < NumSyms; ++j) { + if (Syms[j].st_name == 0) + continue; + const char *Name = StrTab + Syms[j].st_name; + if (strncmp(Name, Prefix, PrefixLen) == 0) { + if (CB(Name, UserData)) + return; + } + } + } +} + +/* State passed through the enumeration callback. */ +typedef struct { + void *Module; /* hipModule_t */ + OffloadDynamicModuleInfo *ModInfo; +} EnumState; + +/* Register one __llvm_profile_sections_<CUID> symbol on the module entry. + * hipModuleGetGlobal also registers the device address with CLR so hipMemcpy + * can copy from it later. */ +static int registerPrfSymbol(const char *Name, void *UserData) { + EnumState *S = (EnumState *)UserData; + OffloadDynamicModuleInfo *MI = S->ModInfo; + + /* The symbol is the per-TU sections struct itself, not a pointer + * indirection, so this address is the hipMemcpy source. */ + void *DeviceVar = nullptr; + size_t Bytes = 0; + if (hipModuleGetGlobal(&DeviceVar, &Bytes, S->Module, Name) != 0) { + PROF_WARN("failed to get symbol %s for module %p\n", Name, S->Module); + return 0; /* continue */ + } + + if (MI->NumTUs >= MI->CapTUs) { + int NewCap = MI->CapTUs ? MI->CapTUs * 2 : 4; + OffloadDynamicTUInfo *New = (OffloadDynamicTUInfo *)realloc( + MI->TUs, NewCap * sizeof(OffloadDynamicTUInfo)); + if (!New) { + PROF_ERR("%s\n", "failed to grow TU array"); + return 0; + } + MI->TUs = New; + MI->CapTUs = NewCap; + } + OffloadDynamicTUInfo *TU = &MI->TUs[MI->NumTUs++]; + TU->DeviceVar = DeviceVar; + TU->Processed = 0; + + (void)Name; + return 0; /* continue enumeration */ +} + +#endif /* __has_include(<elf.h>) */ + +/* -------------------------------------------------------------------------- */ +/* Registration / un-registration helpers */ +/* -------------------------------------------------------------------------- */ + +extern "C" void +__llvm_profile_offload_register_dynamic_module(int ModuleLoadRc, void **Ptr, + const void *Image) { + if (ModuleLoadRc) + return; + + lockDynamicModules(); + + if (isVerboseMode()) + PROF_NOTE("Registering loaded module %d: rc=%d, module=%p, image=%p\n", + NumDynamicModules, ModuleLoadRc, *Ptr, Image); + + if (NumDynamicModules >= CapDynamicModules) { + int NewCap = CapDynamicModules ? CapDynamicModules * 2 : 64; + OffloadDynamicModuleInfo *New = (OffloadDynamicModuleInfo *)realloc( + DynamicModules, NewCap * sizeof(OffloadDynamicModuleInfo)); + if (!New) { + unlockDynamicModules(); + return; + } + DynamicModules = New; + CapDynamicModules = NewCap; + } + + OffloadDynamicModuleInfo *MI = &DynamicModules[NumDynamicModules++]; + MI->ModulePtr = *Ptr; + MI->TUs = nullptr; + MI->NumTUs = 0; + MI->CapTUs = 0; + + /* Dynamic-module profiling needs ELF parsing for symbol enumeration. */ +#if __has_include(<elf.h>) + EnumState State = {*Ptr, MI}; + enumerateElfSymbols(Image, "__llvm_profile_sections_", registerPrfSymbol, + &State); +#else + (void)Image; + if (isVerboseMode()) + PROF_NOTE("%s", + "Dynamic module profiling not supported on this platform\n"); +#endif + + if (MI->NumTUs == 0) { + PROF_WARN("no __llvm_profile_sections_* symbols found in module %p\n", + *Ptr); + } else if (isVerboseMode()) { + PROF_NOTE("Module %p: registered %d TU(s)\n", *Ptr, MI->NumTUs); + } + + unlockDynamicModules(); +} + +extern "C" void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) { + lockDynamicModules(); + for (int i = 0; i < NumDynamicModules; ++i) { + OffloadDynamicModuleInfo *MI = &DynamicModules[i]; + + /* HIP recycles hipModule_t addresses; drained slots are cleared so a + * recycled handle finds the new slot, not the dead one. */ + if (MI->ModulePtr != Ptr) + continue; + + if (isVerboseMode()) + PROF_NOTE("Unregistering module %p (%d TUs)\n", MI->ModulePtr, + MI->NumTUs); + + static int NextTUIndex = 0; + for (int t = 0; t < MI->NumTUs; ++t) { + OffloadDynamicTUInfo *TU = &MI->TUs[t]; + if (TU->Processed) { + if (isVerboseMode()) + PROF_NOTE("Module %p TU %d already processed, skipping\n", Ptr, t); + continue; + } + int TUIndex = __atomic_fetch_add(&NextTUIndex, 1, __ATOMIC_RELAXED); + if (TU->DeviceVar) { + int CurDev = 0; + hipGetDevice(&CurDev); + const char *ArchName = getDeviceArchName(CurDev); + /* Encode TUIndex in Target so each drain writes a distinct profraw; + * otherwise back-to-back drains overwrite the same file. */ + char TargetWithTU[64]; + snprintf(TargetWithTU, sizeof(TargetWithTU), "%s.%d", ArchName, + TUIndex); + if (processDeviceOffloadPrf(TU->DeviceVar, TUIndex, TargetWithTU) == 0) + TU->Processed = 1; + else + PROF_WARN("failed to process profile data for module %p TU %d\n", Ptr, + t); + } + } + MI->ModulePtr = nullptr; + unlockDynamicModules(); + return; + } + + if (isVerboseMode()) + PROF_WARN("unregister called for unknown module %p\n", Ptr); + unlockDynamicModules(); +} + +/* Grow a void* array, doubling capacity (or starting at InitCap). */ +static int growPtrArray(void ***Arr, int *Num, int *Cap, int InitCap) { + if (*Num < *Cap) + return 0; + int NewCap = *Cap ? *Cap * 2 : InitCap; + void **New = (void **)realloc(*Arr, NewCap * sizeof(void *)); + if (!New) + return -1; + *Arr = New; + *Cap = NewCap; + return 0; +} + +static void **OffloadShadowVariables = nullptr; +static int NumShadowVariables = 0; +static int CapShadowVariables = 0; + +extern "C" void __llvm_profile_offload_register_shadow_variable(void *ptr) { + if (growPtrArray(&OffloadShadowVariables, &NumShadowVariables, + &CapShadowVariables, 64)) + return; + OffloadShadowVariables[NumShadowVariables++] = ptr; +} + +static void **OffloadSectionShadowVariables = nullptr; +static int NumSectionShadowVariables = 0; +static int CapSectionShadowVariables = 0; + +extern "C" void +__llvm_profile_offload_register_section_shadow_variable(void *ptr) { + if (growPtrArray(&OffloadSectionShadowVariables, &NumSectionShadowVariables, + &CapSectionShadowVariables, 64)) + return; + OffloadSectionShadowVariables[NumSectionShadowVariables++] = ptr; +} + +namespace { + +// free()-based scope guard. Use .release() to transfer ownership. +struct UniqueFree { + void *Ptr; + explicit UniqueFree(void *P = nullptr) : Ptr(P) {} + ~UniqueFree() { free(Ptr); } + UniqueFree(const UniqueFree &) = delete; + UniqueFree &operator=(const UniqueFree &) = delete; + char *get() const { return static_cast<char *>(Ptr); } + void reset(void *P) { + free(Ptr); + Ptr = P; + } + void *release() { + void *P = Ptr; + Ptr = nullptr; + return P; + } +}; + +} // namespace + +static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, + const char *Target) { + __llvm_profile_gpu_sections HostSections; + + if (hipMemcpy(&HostSections, DeviceOffloadPrf, sizeof(HostSections), + 2 /*DToH*/) != 0) { + PROF_ERR("%s\n", "failed to copy offload prf structure from device"); + return -1; + } + + const void *DevCntsBegin = HostSections.CountersStart; + const void *DevDataBegin = HostSections.DataStart; + const void *DevNamesBegin = HostSections.NamesStart; + const void *DevCntsEnd = HostSections.CountersStop; + const void *DevDataEnd = HostSections.DataStop; + const void *DevNamesEnd = HostSections.NamesStop; + + size_t CountersSize = (const char *)DevCntsEnd - (const char *)DevCntsBegin; + size_t DataSize = (const char *)DevDataEnd - (const char *)DevDataBegin; + size_t NamesSize = (const char *)DevNamesEnd - (const char *)DevNamesBegin; + + if (isVerboseMode()) + PROF_NOTE("Section pointers: Cnts=[%p,%p]=%zu Data=[%p,%p]=%zu " + "Names=[%p,%p]=%zu\n", + DevCntsBegin, DevCntsEnd, CountersSize, DevDataBegin, DevDataEnd, + DataSize, DevNamesBegin, DevNamesEnd, NamesSize); + + if (CountersSize == 0 || DataSize == 0) + return 0; + + int ret = -1; + int NamesReused = 0, CntsReused = 0, DataReused = 0; + + char *HostDataBegin = nullptr; + char *HostCountersBegin = nullptr; + char *HostNamesBegin = nullptr; + + /* Sections using linker-defined __start_/__stop_ bounds are shared across + TU structs in RDC mode. Deduplicate by caching the last copied range. */ + static const void *CachedDevNamesBegin = nullptr; + static char *CachedHostNames = nullptr; + static size_t CachedNamesSize = 0; + + static const void *CachedDevCntsBegin = nullptr; + static char *CachedHostCnts = nullptr; + static size_t CachedCntsSize = 0; + + static const void *CachedDevDataBegin = nullptr; + static char *CachedHostData = nullptr; + static size_t CachedDataSize = 0; + + // Owns freshly malloc'd buffers; release() transfers ownership to the cache. + UniqueFree CntsOwner, DataOwner, NamesOwner; + + if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin && + CountersSize == CachedCntsSize) { + HostCountersBegin = CachedHostCnts; + CntsReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached counters section (%zu bytes)\n", CountersSize); + } else if (CountersSize > 0) { + HostCountersBegin = (char *)malloc(CountersSize); + CntsOwner.reset(HostCountersBegin); + } + + if (DataSize > 0 && DevDataBegin == CachedDevDataBegin && + DataSize == CachedDataSize) { + HostDataBegin = CachedHostData; + DataReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize); + } else if (DataSize > 0) { + HostDataBegin = (char *)malloc(DataSize); + DataOwner.reset(HostDataBegin); + } + + if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin && + NamesSize == CachedNamesSize) { + HostNamesBegin = CachedHostNames; + NamesReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize); + } else if (NamesSize > 0) { + HostNamesBegin = (char *)malloc(NamesSize); + NamesOwner.reset(HostNamesBegin); + } + + if ((DataSize > 0 && !HostDataBegin) || + (CountersSize > 0 && !HostCountersBegin) || + (NamesSize > 0 && !HostNamesBegin)) { + PROF_ERR("%s\n", "failed to allocate host memory for device sections"); + return -1; + } + + if ((DataSize > 0 && !DataReused && + memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) || + (CountersSize > 0 && !CntsReused && + memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) != + 0) || + (NamesSize > 0 && !NamesReused && + memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) { + PROF_ERR("%s\n", "failed to copy profile sections from device"); + return -1; + } + + /* Cache buffers so RDC-mode multi-shadow drains can reuse them. + * release() prevents the scope guards from freeing what the cache owns. */ + if (!CntsReused && CountersSize > 0) { + CachedDevCntsBegin = DevCntsBegin; + CachedHostCnts = HostCountersBegin; + CachedCntsSize = CountersSize; + CntsOwner.release(); + } + if (!DataReused && DataSize > 0) { + CachedDevDataBegin = DevDataBegin; + CachedHostData = HostDataBegin; + CachedDataSize = DataSize; + DataOwner.release(); + } + if (!NamesReused && NamesSize > 0) { + CachedDevNamesBegin = DevNamesBegin; + CachedHostNames = HostNamesBegin; + CachedNamesSize = NamesSize; + NamesOwner.release(); + } + + if (isVerboseMode()) + PROF_NOTE("Copied device sections: Counters=%zu, Data=%zu, Names=%zu\n", + CountersSize, DataSize, NamesSize); + + // Arrange buffer as [Data][Padding][Counters][Names] to match the layout + // expected by lprofWriteDataImpl (CountersDelta = CountersBegin - DataBegin). + const uint64_t NumData = DataSize / sizeof(__llvm_profile_data); + const uint64_t NumBitmapBytes = 0; + const uint64_t VTableSectionSize = 0; + const uint64_t VNamesSize = 0; + uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, + PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; + + if (__llvm_profile_get_padding_sizes_for_counters( + DataSize, CountersSize, NumBitmapBytes, NamesSize, VTableSectionSize, + VNamesSize, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames, + &PaddingBytesAfterVTable, &PaddingBytesAfterVNames) != 0) { + PROF_ERR("%s\n", "failed to get padding sizes"); + return -1; + } + + size_t ContiguousBufferSize = + DataSize + PaddingBytesBeforeCounters + CountersSize + NamesSize; + UniqueFree ContiguousBuf(malloc(ContiguousBufferSize)); + if (!ContiguousBuf.get()) { + PROF_ERR("%s\n", "failed to allocate contiguous buffer"); + return -1; + } + char *ContiguousBuffer = ContiguousBuf.get(); + __builtin_memset(ContiguousBuffer, 0, ContiguousBufferSize); + + char *BufDataBegin = ContiguousBuffer; + char *BufCountersBegin = + ContiguousBuffer + DataSize + PaddingBytesBeforeCounters; + char *BufNamesBegin = BufCountersBegin + CountersSize; + + __builtin_memcpy(BufDataBegin, HostDataBegin, DataSize); + __builtin_memcpy(BufCountersBegin, HostCountersBegin, CountersSize); + __builtin_memcpy(BufNamesBegin, HostNamesBegin, NamesSize); + + // CounterPtr is a device-relative offset; relocate it for the file layout + // where the Data section precedes Counters. + __llvm_profile_data *RelocatedData = (__llvm_profile_data *)BufDataBegin; + for (uint64_t i = 0; i < NumData; ++i) { + if (RelocatedData[i].CounterPtr) { + ptrdiff_t DeviceCounterPtrOffset = (ptrdiff_t)RelocatedData[i].CounterPtr; + const char *DeviceDataStructAddr = + (const char *)DevDataBegin + (i * sizeof(__llvm_profile_data)); + const char *DeviceCountersAddr = + DeviceDataStructAddr + DeviceCounterPtrOffset; + ptrdiff_t OffsetIntoCountersSection = + DeviceCountersAddr - (const char *)DevCntsBegin; + + ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters + + OffsetIntoCountersSection - + (i * sizeof(__llvm_profile_data)); + __builtin_memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) + + offsetof(__llvm_profile_data, CounterPtr), + &NewRelativeOffset, sizeof(NewRelativeOffset)); + } + __builtin_memset((char *)RelocatedData + i * sizeof(__llvm_profile_data) + + offsetof(__llvm_profile_data, BitmapPtr), + 0, + sizeof(RelocatedData[i].BitmapPtr) + + sizeof(RelocatedData[i].FunctionPointer) + + sizeof(RelocatedData[i].Values)); + } + + /* Target already encodes TUIndex when needed. */ + (void)TUIndex; + + ret = __llvm_write_custom_profile( + Target, (__llvm_profile_data *)BufDataBegin, + (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin, + BufCountersBegin + CountersSize, BufNamesBegin, BufNamesBegin + NamesSize, + nullptr); + + if (ret != 0) { + PROF_ERR("%s\n", "failed to write device profile using shared API"); + } else if (isVerboseMode()) { + PROF_NOTE("%s\n", "Successfully wrote device profile using shared API"); + } + + return ret; +} + +static int processShadowVariable(void *ShadowVar, int TUIndex, + const char *Target) { + void *DeviceSections = nullptr; + if (hipGetSymbolAddress(&DeviceSections, ShadowVar) != 0) { + PROF_WARN("failed to get symbol address for shadow variable %p\n", + ShadowVar); + return -1; + } + /* DeviceSections points at the per-TU sections struct itself. */ + return processDeviceOffloadPrf(DeviceSections, TUIndex, Target); +} + +static int isHipAvailable(void) { + ensureHipLoaded(); + return pHipMemcpy != nullptr && pHipGetSymbolAddress != nullptr; +} + +/* -------------------------------------------------------------------------- */ +/* Collect device-side profile data */ +/* -------------------------------------------------------------------------- */ + +extern "C" int __llvm_profile_hip_collect_device_data(void) { + if (NumShadowVariables == 0 && NumDynamicModules == 0) + return 0; + + if (!isHipAvailable()) + return 0; + + int Ret = 0; + + /* Shadow variables (static-linked kernels): drain from every device. */ + if (NumShadowVariables > 0) { + int OrigDevice = -1; + hipGetDevice(&OrigDevice); + + for (int Dev = 0; Dev < NumDevices; ++Dev) { + if (hipSetDevice(Dev) != 0) { + if (isVerboseMode()) + PROF_NOTE("Failed to set device %d, skipping\n", Dev); + continue; + } + const char *ArchName = getDeviceArchName(Dev); + if (isVerboseMode()) + PROF_NOTE("Collecting static profile data from device %d (%s)\n", Dev, + ArchName); + for (int i = 0; i < NumShadowVariables; ++i) { + /* RDC-mode multi-shadow drains need a distinct profraw per TU; + * single-TU programs keep the bare arch target. */ + const char *Target = ArchName; + char TargetWithIdx[64]; + if (NumShadowVariables > 1) { + snprintf(TargetWithIdx, sizeof(TargetWithIdx), "%s.%d", ArchName, i); + Target = TargetWithIdx; + } + if (processShadowVariable(OffloadShadowVariables[i], i, Target) != 0) + Ret = -1; + } + } + + if (OrigDevice >= 0) + hipSetDevice(OrigDevice); + } + + /* Warn about unprocessed TUs; skip cleared slots (already drained). */ + lockDynamicModules(); + for (int i = 0; i < NumDynamicModules; ++i) { + OffloadDynamicModuleInfo *MI = &DynamicModules[i]; + if (!MI->ModulePtr) + continue; + for (int t = 0; t < MI->NumTUs; ++t) { + if (!MI->TUs[t].Processed) { + PROF_WARN("dynamic module %p TU %d was not processed before exit\n", + MI->ModulePtr, t); + Ret = -1; + } + } + } + unlockDynamicModules(); + + if (Ret != 0) + PROF_WARN("%s\n", "failed to collect device profile data"); + return Ret; +} + +/* Interceptors for hipModuleLoad* / hipModuleUnload. Linux only. */ + +#if defined(__linux__) && !defined(_WIN32) + +INTERCEPTOR(int, hipModuleLoad, void **module, const char *fname) { + int rc = REAL(hipModuleLoad)(module, fname); + /* Pass NULL image: no in-memory ELF is available for filename loads, + * so the register hook skips symbol enumeration. */ + __llvm_profile_offload_register_dynamic_module(rc, module, nullptr); + return rc; +} + +INTERCEPTOR(int, hipModuleLoadData, void **module, const void *image) { + int rc = REAL(hipModuleLoadData)(module, image); + __llvm_profile_offload_register_dynamic_module(rc, module, image); + return rc; +} + +INTERCEPTOR(int, hipModuleLoadDataEx, void **module, const void *image, + unsigned numOptions, void **options, void **optionValues) { + int rc = REAL(hipModuleLoadDataEx)(module, image, numOptions, options, + optionValues); + __llvm_profile_offload_register_dynamic_module(rc, module, image); + return rc; +} + +INTERCEPTOR(int, hipModuleUnload, void *module) { + /* Drain counters before the module is destroyed; device addresses + * captured at register time are invalid after unload. */ + __llvm_profile_offload_unregister_dynamic_module(module); + return REAL(hipModuleUnload)(module); +} + +__attribute__((constructor)) static void installHipModuleInterceptors() { + /* Skip when the HIP runtime is not loaded. INTERCEPT_FUNCTION uses the + * sanitizer interception framework, which can perturb dlsym/PLT state for + * the rest of the process even when the target symbol is absent; non-HIP + * programs linked with libclang_rt.profile.a must see zero side effects. */ + if (!dlsym(RTLD_DEFAULT, "hipModuleLoad")) + return; + if (!INTERCEPT_FUNCTION(hipModuleLoad)) + return; + if (isVerboseMode()) + PROF_NOTE("%s", "Installing hipModuleLoad*/hipModuleUnload interceptors\n"); + INTERCEPT_FUNCTION(hipModuleLoadData); + INTERCEPT_FUNCTION(hipModuleLoadDataEx); + INTERCEPT_FUNCTION(hipModuleUnload); +} + +#endif /* __linux__ */ diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index cbdc48a9a717f..37bad559f49e7 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -500,6 +500,9 @@ def DEOPTIMIZE : RuntimeLibcall; // Return address def RETURN_ADDRESS : RuntimeLibcall; +// GPU profiling +def PROFILE_INSTRUMENT_GPU : RuntimeLibcall; + // Clear cache def CLEAR_CACHE : RuntimeLibcall; def RISCV_FLUSH_ICACHE : RuntimeLibcall; @@ -2241,8 +2244,10 @@ def WindowsARM64ECSystemLibrary def isAMDGPU : RuntimeLibcallPredicate<"TT.isAMDGPU()">; -// No calls. -def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add)>; +def __llvm_profile_instrument_gpu : RuntimeLibcallImpl<PROFILE_INSTRUMENT_GPU>; + +def AMDGPUSystemLibrary + : SystemRuntimeLibrary<isAMDGPU, (add __llvm_profile_instrument_gpu)>; //===----------------------------------------------------------------------===// // ARM Runtime Libcalls diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index dffc58281c2d9..b7f917ec39b70 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -122,6 +122,11 @@ inline StringRef getInstrProfValueProfMemOpFuncName() { /// Return the prefix of the name of the variables to function as a filter. inline StringRef getInstrProfVarPrefix() { return "__prof"; } +/// Return the name of the GPU wave-cooperative counter increment helper. +inline StringRef getInstrProfInstrumentGPUFuncName() { + return INSTR_PROF_INSTRUMENT_GPU_FUNC_STR; +} + /// Return the name prefix of variables containing instrumented function names. inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; } diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index dabd495cddd49..8e4ba41919768 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -33,14 +34,17 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" @@ -287,6 +291,8 @@ class InstrLowerer final { GlobalVariable *NamesVar = nullptr; size_t NamesSize = 0; + StructType *ProfileDataTy = nullptr; + // vector of counter load/store pairs to be register promoted. std::vector<LoadStorePair> PromotionCandidates; @@ -407,6 +413,9 @@ class InstrLowerer final { /// Create a static initializer for our data, on platforms that need it, /// and for any profile output file that was specified. void emitInitialization(); + + /// Return the __llvm_profile_data struct type. + StructType *getProfileDataTy(); }; /// @@ -1190,19 +1199,22 @@ void InstrLowerer::lowerTimestamp( void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) { auto *Addr = getCounterAddress(Inc); - IRBuilder<> Builder(Inc); if (isGPUProfTarget(M)) { - auto *I64Ty = Builder.getInt64Ty(); + auto *Int64Ty = Builder.getInt64Ty(); auto *PtrTy = Builder.getPtrTy(); auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()), - {PtrTy, PtrTy, I64Ty}, false); - auto Callee = - M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy); + {PtrTy, PtrTy, Int64Ty}, false); + FunctionCallee Callee = + M.getOrInsertFunction(RTLIB::RuntimeLibcallsInfo::getLibcallImplName( + RTLIB::impl___llvm_profile_instrument_gpu), + CalleeTy); Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy); Value *Uniform = ConstantPointerNull::get(PointerType::getUnqual(M.getContext())); - Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()}); + Value *StepI64 = + Builder.CreateZExtOrTrunc(Inc->getStep(), Int64Ty, "step.i64"); + Builder.CreateCall(Callee, {CastAddr, Uniform, StepI64}); } else if (Options.Atomic || AtomicCounterUpdateAll || (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) { Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(), @@ -1400,6 +1412,12 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) { if (shouldUsePublicSymbol(Fn)) return Fn; + // For GPU targets, weak functions cannot use private aliases because + // LTO may pick a different TU's copy, leaving the alias undefined + if (isGPUProfTarget(*Fn->getParent()) && + GlobalValue::isWeakForLinker(Fn->getLinkage())) + return Fn; + // When possible use a private alias to avoid symbolic relocations. auto *GA = GlobalAlias::create(GlobalValue::LinkageTypes::PrivateLinkage, Fn->getName() + ".local", Fn); @@ -1623,11 +1641,15 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc, } Ptr->setVisibility(Visibility); - // Put the counters and bitmaps in their own sections so linkers can - // remove unneeded sections. Ptr->setSection(getInstrProfSectionName(IPSK, TT.getObjectFormat())); Ptr->setLinkage(Linkage); - maybeSetComdat(Ptr, Fn, VarName); + if (isGPUProfTarget(M) && !Ptr->hasComdat()) { + Ptr->setComdat(M.getOrInsertComdat(VarName)); + Ptr->setLinkage(GlobalValue::LinkOnceODRLinkage); + Ptr->setVisibility(GlobalValue::ProtectedVisibility); + } else { + maybeSetComdat(Ptr, Fn, VarName); + } return Ptr; } @@ -1799,7 +1821,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { } uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); - auto *CounterPtr = PD.RegionCounters; + + Constant *CounterPtr = PD.RegionCounters; uint64_t NumBitmapBytes = PD.NumBitmapBytes; @@ -1807,11 +1830,7 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); auto *Int16Ty = Type::getInt16Ty(Ctx); auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1); - Type *DataTypes[] = { -#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, -#include "llvm/ProfileData/InstrProfData.inc" - }; - auto *DataTy = StructType::get(Ctx, ArrayRef(DataTypes)); + auto *DataTy = getProfileDataTy(); Constant *FunctionAddr = getFuncAddrForProfData(Fn); @@ -1819,6 +1838,15 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + if (isGPUProfTarget(M)) { + // For GPU targets, weak functions need weak linkage for their profile data + // aliases to allow linker deduplication across TUs + if (GlobalValue::isWeakForLinker(Fn->getLinkage())) + Linkage = Fn->getLinkage(); + else + Linkage = GlobalValue::ExternalLinkage; + Visibility = GlobalValue::ProtectedVisibility; + } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. This @@ -1830,19 +1858,22 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. - if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + if (!isGPUProfTarget(M) && NS == 0 && + !(DataReferencedByCode && NeedComdat && !Renamed) && (TT.isOSBinFormatELF() || (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } - // AMDGPU objects are always ET_DYN, so non-local symbols with default - // visibility are preemptible. The CounterPtr label difference emits a REL32 - // relocation that lld rejects against preemptible targets. - if (TT.isAMDGPU() && !GlobalValue::isLocalLinkage(Linkage)) + // GPU-target ELF objects are always ET_DYN, so non-local symbols with + // default visibility are preemptible. The CounterPtr label difference + // emits a REL32 relocation that lld rejects against preemptible targets. + if (TT.isGPU() && TT.isOSBinFormatELF() && + !GlobalValue::isLocalLinkage(Linkage)) Visibility = GlobalValue::ProtectedVisibility; auto *Data = new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName); + Constant *RelativeCounterPtr; GlobalVariable *BitmapPtr = PD.RegionBitmaps; Constant *RelativeBitmapPtr = ConstantInt::get(IntPtrTy, 0); @@ -1883,7 +1914,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { Data->setSection( getInstrProfSectionName(DataSectionKind, TT.getObjectFormat())); Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); - maybeSetComdat(Data, Fn, CntsVarName); + if (isGPUProfTarget(M) && !Data->hasComdat()) { + Data->setComdat(M.getOrInsertComdat(CntsVarName)); + Data->setLinkage(GlobalValue::LinkOnceODRLinkage); + } else { + maybeSetComdat(Data, Fn, CntsVarName); + } PD.DataVar = Data; @@ -1961,16 +1997,18 @@ void InstrLowerer::emitNameData() { auto &Ctx = M.getContext(); auto *NamesVal = ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); - NamesVar = new GlobalVariable(M, NamesVal->getType(), true, - GlobalValue::PrivateLinkage, NamesVal, - getInstrProfNamesVarName()); + std::string NamesVarName = std::string(getInstrProfNamesVarName()); + NamesVar = + new GlobalVariable(M, NamesVal->getType(), true, + GlobalValue::PrivateLinkage, NamesVal, NamesVarName); NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); - NamesVar->setSection( + std::string NamesSectionName = ProfileCorrelate == InstrProfCorrelator::BINARY ? getInstrProfSectionName(IPSK_covname, TT.getObjectFormat()) - : getInstrProfSectionName(IPSK_name, TT.getObjectFormat())); + : getInstrProfSectionName(IPSK_name, TT.getObjectFormat()); + NamesVar->setSection(NamesSectionName); // On COFF, it's important to reduce the alignment down to 1 to prevent the // linker from inserting padding before the start of the names section or // between names entries. @@ -2179,3 +2217,22 @@ void createProfileSamplingVar(Module &M) { appendToCompilerUsed(M, SamplingVar); } } // namespace llvm + +// For GPU targets: Allocate contiguous arrays for all profile data. +// This solves the linker reordering problem by using ONE symbol per section +// type, so there's nothing for the linker to reorder. +StructType *InstrLowerer::getProfileDataTy() { + if (ProfileDataTy) + return ProfileDataTy; + + auto &Ctx = M.getContext(); + auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); + auto *Int16Ty = Type::getInt16Ty(Ctx); + auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1); + Type *DataTypes[] = { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, +#include "llvm/ProfileData/InstrProfData.inc" + }; + ProfileDataTy = StructType::get(Ctx, ArrayRef(DataTypes)); + return ProfileDataTy; +} diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index db032d6fcad45..b6d07aa821e7f 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -380,7 +380,8 @@ class FunctionInstrumenter final { // another counter range within the context. bool isValueProfilingDisabled() const { return DisableValueProfiling || - InstrumentationType == PGOInstrumentationType::CTXPROF; + InstrumentationType == PGOInstrumentationType::CTXPROF || + M.getTargetTriple().isGPU(); } bool shouldInstrumentEntryBB() const { diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll new file mode 100644 index 0000000000000..efe53ab1ebdfb --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll @@ -0,0 +1,32 @@ +;; Test basic AMDGPU PGO instrumentation lowering. +;; Verifies that each instrumentation point lowers directly to a call to +;; __llvm_profile_instrument_gpu with a null uniform-counter argument. + +; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s + +@__hip_cuid_test01 = addrspace(1) global i8 0 +@__profn_test_kernel = private constant [11 x i8] c"test_kernel" + +define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out, i32 %n) { +entry: + call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 0) + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 1) + store i32 1, ptr addrspace(1) %out + br label %if.end + +if.end: + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) + +; CHECK-LABEL: define {{.*}} @test_kernel +; CHECK-NOT: @__llvm_profile_sampling_gpu +; CHECK: call void @__llvm_profile_instrument_gpu( +; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) @__profc_test_kernel to ptr), ptr null, i64 1) +; CHECK: call void @__llvm_profile_instrument_gpu( +; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([4 x i64], ptr addrspace(1) @__profc_test_kernel, i32 0, i32 1) to ptr), ptr null, i64 1) diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll new file mode 100644 index 0000000000000..eab78fb3591b1 --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll @@ -0,0 +1,26 @@ +;; Per-kernel __profc_* arrays land in section __llvm_prf_cnts with one slot +;; per counter, and counter increments lower to __llvm_profile_instrument_gpu +;; calls whose pointer argument is a GEP into the per-kernel array. + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instrprof < %s | FileCheck %s + +@__profn_kernel1 = private constant [7 x i8] c"kernel1" +@__profn_kernel2 = private constant [7 x i8] c"kernel2" + +; CHECK: @__profc_kernel1 = linkonce_odr protected addrspace(1) global [2 x i64] zeroinitializer, section "__llvm_prf_cnts" +; CHECK: @__profc_kernel2 = linkonce_odr protected addrspace(1) global [1 x i64] zeroinitializer, section "__llvm_prf_cnts" + +define amdgpu_kernel void @kernel1() { + call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 0) + call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 1) + ret void +} + +define amdgpu_kernel void @kernel2() { + call void @llvm.instrprof.increment(ptr @__profn_kernel2, i64 67890, i32 1, i32 0) + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) + +; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([2 x i64], ptr addrspace(1) @__profc_kernel1, i32 0, i32 1) to ptr), ptr null, i64 1) diff --git a/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll new file mode 100644 index 0000000000000..ce16f1ee3215f --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -passes=instrprof -S | FileCheck %s + +; Test that weak functions on GPU targets get weak linkage for their +; __profd_ aliases to allow linker deduplication across TUs. +; Non-weak functions get external linkage (default for aliases). + +target triple = "amdgcn-amd-amdhsa" + +@__hip_cuid_abc123 = addrspace(1) global i8 0 + +; AMDGPU GPU profiling lowers to per-function comdat globals (not aliases). +; CHECK: @__profd_weak_func = linkonce_odr protected addrspace(1) global +@__profn_weak_func = private constant [9 x i8] c"weak_func" + +define weak void @weak_func() { + call void @llvm.instrprof.increment(ptr @__profn_weak_func, i64 0, i32 1, i32 0) + ret void +} + +; CHECK: @__profd_weak_odr_func = linkonce_odr protected addrspace(1) global +@__profn_weak_odr_func = private constant [13 x i8] c"weak_odr_func" + +define weak_odr void @weak_odr_func() { + call void @llvm.instrprof.increment(ptr @__profn_weak_odr_func, i64 0, i32 1, i32 0) + ret void +} + +; CHECK: @__profd_normal_func = linkonce_odr protected addrspace(1) global +@__profn_normal_func = private constant [11 x i8] c"normal_func" + +define void @normal_func() { + call void @llvm.instrprof.increment(ptr @__profn_normal_func, i64 0, i32 1, i32 0) + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) diff --git a/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll new file mode 100644 index 0000000000000..21b1d68004b13 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll @@ -0,0 +1,22 @@ +;; Test that value profiling (indirect call profiling) is disabled for GPU targets. +;; The device-side profiling runtime does not implement +;; __llvm_profile_instrument_target, so indirect call profiling must not be emitted. + +; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +@fptr = addrspace(1) global ptr null, align 8 + +;; Verify that regular block instrumentation IS emitted +; CHECK: call void @llvm.instrprof.increment + +;; Verify that value profiling for indirect calls is NOT emitted +; CHECK-NOT: call void @llvm.instrprof.value.profile + +define amdgpu_kernel void @test_indirect_call() { +entry: + %fp = load ptr, ptr addrspace(1) @fptr, align 8 + call void %fp() + ret void +} >From 10cbf51f82f91f116542de39216d0d509d8cc9bd Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Wed, 3 Jun 2026 15:33:59 -0400 Subject: [PATCH 2/2] Reland "[PGO][AMDGPU] Add basic HIP offload PGO support (#177665)" This relands commit 5db13643f4b7 and its merged follow-ups, which were all reverted together in 6cfa1a01a0f7 (#201416): 635e120fb873 [PGO][HIP] Stop pulling ROCm.o into every PGO host link (#200101) 2766733764f4 [compiler-rt][profile] Add COMPILER_RT_BUILD_PROFILE_ROCM option (#200127) 4c33844b4b56 [PGO][HIP] Skip ROCm interceptor in profile-only compiler-rt builds (#200111) 5eca8b67ff3c [PGO][HIP] Fix profile-only Windows link by gating ROCm interceptor macro (#200859) The revert was done because the merged change still broke Windows profiling builds with the static CRT (the interceptor merge forced the profile library to /MD). That last regression had no merged fix yet; it is addressed by the follow-up commit in this reland, so the series is green on Windows from the start. This reverts commit 6cfa1a01a0f737ed1d54963810057fb1bd67a274. --- compiler-rt/CMakeLists.txt | 7 ++++++- compiler-rt/lib/profile/CMakeLists.txt | 11 +++++++---- compiler-rt/test/profile/CMakeLists.txt | 8 ++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index 39034fd9ba67d..2693b35cad5c7 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -323,7 +323,12 @@ option(COMPILER_RT_USE_ATOMIC_LIBRARY "Use compiler-rt atomic instead of libatom option(COMPILER_RT_PROFILE_BAREMETAL "Build minimal baremetal profile library" OFF) set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM ON) -if(APPLE) +if(APPLE OR WIN32) + # On Windows the HIP host interceptor (InstrProfilingPlatformROCm.cpp) pulls in + # RTInterception + sanitizer_common, which are built /MD and force the whole + # profile library to /MD, breaking the default static-CRT (/MT) profile builds. + # Device PGO on Windows is not wired up yet, so default this off; users who need + # it can opt in explicitly and accept the /MD profile library. set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM OFF) endif() option(COMPILER_RT_BUILD_PROFILE_ROCM diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 77db2477bb7c6..02c94cd2c4db2 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -161,12 +161,15 @@ endif() # The HIP host interceptor in InstrProfilingPlatformROCm.cpp pulls in # RTInterception + sanitizer_common object libs. Those targets are only created # when COMPILER_RT_BUILD_SANITIZERS / _MEMPROF / _XRAY / _CTX_PROFILE is enabled -# (see lib/CMakeLists.txt). In a profile-only build the targets do not exist; -# skip both the object-lib merge and the ROCm source file so the static archive -# remains self-contained. +# (see lib/CMakeLists.txt). Only merge them (and keep the ROCm source) when ROCm +# device PGO is requested and the targets exist; otherwise skip both so the +# static archive stays self-contained. This also keeps the profile library on +# the static CRT (/MT) on Windows by default, since the merged sanitizer object +# libs are built /MD. set(PROFILE_OBJECT_LIBS) set(PROFILE_HAS_HIP_INTERCEPTOR FALSE) -if(COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL +if(COMPILER_RT_BUILD_PROFILE_ROCM + AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL AND TARGET RTInterception.${COMPILER_RT_DEFAULT_TARGET_ARCH} AND TARGET RTSanitizerCommon.${COMPILER_RT_DEFAULT_TARGET_ARCH} AND TARGET RTSanitizerCommonLibc.${COMPILER_RT_DEFAULT_TARGET_ARCH}) diff --git a/compiler-rt/test/profile/CMakeLists.txt b/compiler-rt/test/profile/CMakeLists.txt index 213a05032ed80..7e79c054328a8 100644 --- a/compiler-rt/test/profile/CMakeLists.txt +++ b/compiler-rt/test/profile/CMakeLists.txt @@ -28,8 +28,12 @@ foreach(arch ${PROFILE_TEST_ARCH}) # with MultiThreadedDLL (/MD), so the .objs reference __imp_* symbols; # the test binary defaults to /MT and fails to link (LNK2019 __imp__stricmp # from interception_win.cpp, LNK4098 default-lib conflicts). Match the - # DLL CRT here so test executables link against the same runtime. - if(MSVC AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL) + # DLL CRT here so test executables link against the same runtime. This must + # mirror the archive's interceptor gate in lib/profile/CMakeLists.txt: the + # libs are only merged (and the archive only goes /MD) when ROCm device PGO + # is enabled, so without it the test binaries stay on the default /MT. + if(MSVC AND COMPILER_RT_BUILD_PROFILE_ROCM + AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL) string(APPEND PROFILE_TEST_TARGET_CFLAGS " -D_MT -D_DLL -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames") endif() _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
