llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-pgo @llvm/pr-subscribers-backend-amdgpu Author: Yaxun (Sam) Liu (yxsamliu) <details> <summary>Changes</summary> Reland of #<!-- -->177665, reverted in #<!-- -->201416 along with its merged follow-ups #<!-- -->200101, #<!-- -->200127, #<!-- -->200111, and #<!-- -->200859. The revert was triggered because the merged change still broke Windows profiling builds that link `clang_rt.profile` with the static CRT (`/MT`): the ROCm collection runtime pulls in `RTInterception` + `sanitizer_common`, which are built `/MD`, forcing the profile archive to `/MD` and breaking static-CRT consumers (`LNK2019 __imp_getpid` …). This reland restores the reverted commits and fixes that last regression: `COMPILER_RT_BUILD_PROFILE_ROCM` now defaults off on Windows (already off on Apple), with the interceptor object-lib merge and the `Profile-*` test `/MD` CRT model gated on it. The Windows archive stays `/MT`, so static-CRT consumers link again. Windows device PGO is not wired up end-to-end yet, so nothing is lost; opt in with `-DCOMPILER_RT_BUILD_PROFILE_ROCM=ON` to get the `/MD` archive. --- Patch is 71.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/201499.diff 15 Files Affected: - (modified) clang/lib/CodeGen/CGCUDANV.cpp (+152) - (added) clang/test/CodeGenHIP/offload-pgo-sections.hip (+50) - (modified) compiler-rt/CMakeLists.txt (+14) - (modified) compiler-rt/lib/profile/CMakeLists.txt (+59-3) - (modified) compiler-rt/lib/profile/InstrProfilingFile.c (+30) - (added) compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp (+897) - (modified) compiler-rt/test/profile/CMakeLists.txt (+6-2) - (modified) llvm/include/llvm/IR/RuntimeLibcalls.td (+7-2) - (modified) llvm/include/llvm/ProfileData/InstrProf.h (+5) - (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+83-26) - (modified) llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp (+2-1) - (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll (+32) - (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll (+26) - (added) llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll (+36) - (added) llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll (+22) ``````````diff diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 259b6c040706b..65f398af7902b 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/Format.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" using namespace clang; using namespace CodeGen; @@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() llvm::GlobalVariable *GpuBinaryHandle = nullptr; + /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global, + /// emitted only for HIP host compiles when PGO is on. Registered via + /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime + /// can locate the device-side table by name. + llvm::GlobalVariable *OffloadProfShadow = nullptr; /// Whether we generate relocatable device code. bool RelocatableDeviceCode; /// Mangle context for device. @@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime { void transformManagedVars(); /// Create offloading entries to register globals in RDC mode. void createOffloadingEntries(); + /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global. + /// On the device side it is the populated 7-pointer section-bounds table. + /// On the host side it is a placeholder void* shadow stored in + /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC) + /// or createOffloadingEntries (RDC) so the runtime can locate the + /// device-side table by name. + void emitOffloadProfilingSections(); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { } } + // Register the per-TU offload-profiling shadow so the host runtime can + // locate the matching device-side __llvm_profile_sections_<CUID>. We + // emit both __hipRegisterVar (so the HIP runtime can map the host + // shadow to the device symbol) and + // __llvm_profile_offload_register_shadow_variable (so the profile + // runtime adds the shadow to its drain list). + if (OffloadProfShadow) { + llvm::Constant *Name = + makeConstantString(std::string(OffloadProfShadow->getName())); + llvm::Value *RegisterVarArgs[] = { + &GpuBinaryHandlePtr, + OffloadProfShadow, + Name, + Name, + llvm::ConstantInt::get(IntTy, /*Extern=*/0), + llvm::ConstantInt::get(VarSizeTy, CGM.getDataLayout().getPointerSize()), + llvm::ConstantInt::get(IntTy, /*Constant=*/0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, RegisterVarArgs); + + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + Builder.CreateCall(RegisterShadow, {OffloadProfShadow}); + } + Builder.CreateRetVoid(); return RegisterKernelsFunc; } @@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() { I.Flags.getSurfTexType()); } } + + // Register the per-TU offload-profiling shadow. The offloading entry + // makes the linker-wrapper emit the host __hipRegisterVar call in the + // combined ctor. Separately emit a per-TU ctor that registers the + // shadow with the profile runtime's drain list. + if (OffloadProfShadow) { + llvm::offloading::emitOffloadingEntry( + M, Kind, OffloadProfShadow, OffloadProfShadow->getName(), + CGM.getDataLayout().getPointerSize(), + llvm::offloading::OffloadGlobalEntry, /*Data=*/0); + + llvm::LLVMContext &Ctx = M.getContext(); + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + auto *CtorFn = llvm::Function::Create( + llvm::FunctionType::get(VoidTy, false), + llvm::GlobalValue::InternalLinkage, + "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), &M); + auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn); + llvm::IRBuilder<> B(Entry); + B.CreateCall(RegisterShadow, {OffloadProfShadow}); + B.CreateRetVoid(); + llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535); + } +} + +// For HIP host+device compiles with PGO enabled, emit the per-TU global +// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding +// section start/stop bounds for the names/counters/data sections plus the +// raw-version variable. Host side: an opaque void* shadow whose only +// purpose is to give the host-runtime a registered symbol name to look up +// via hipGetSymbolAddress; the actual device-side data lives in the +// matching device-side global. +void CGNVCUDARuntime::emitOffloadProfilingSections() { + if (!CGM.getLangOpts().HIP) + return; + if (!CGM.getCodeGenOpts().hasProfileInstr()) + return; + + StringRef CUIDHash = CGM.getContext().getCUIDHash(); + if (CUIDHash.empty()) + return; + + llvm::Module &M = CGM.getModule(); + llvm::LLVMContext &Ctx = M.getContext(); + std::string Name = ("__llvm_profile_sections_" + CUIDHash).str(); + + // If the global already exists (e.g. another TU was merged in), don't + // duplicate it. + if (M.getNamedValue(Name)) + return; + + if (CGM.getLangOpts().CUDAIsDevice) { + // Device side: emit the populated struct. Section start/stop symbols + // are linker-defined (ELF auto-generates __start_/__stop_ for any + // section whose name is a valid C identifier; AMDGPU is ELF). + unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); + auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS); + auto getOrDeclare = [&](StringRef SymName) { + if (auto *GV = M.getNamedGlobal(SymName)) + return GV; + auto *GV = new llvm::GlobalVariable( + M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName, + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::HiddenVisibility); + return GV; + }; + auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version"); + if (!VersionGV) { + VersionGV = new llvm::GlobalVariable( + M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, + "__llvm_profile_raw_version", + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + } + + auto *StructTy = llvm::StructType::get( + Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); + llvm::Constant *Fields[] = { + getOrDeclare("__start___llvm_prf_names"), + getOrDeclare("__stop___llvm_prf_names"), + getOrDeclare("__start___llvm_prf_cnts"), + getOrDeclare("__stop___llvm_prf_cnts"), + getOrDeclare("__start___llvm_prf_data"), + getOrDeclare("__stop___llvm_prf_data"), + VersionGV, + }; + auto *Init = llvm::ConstantStruct::get(StructTy, Fields); + auto *GV = new llvm::GlobalVariable( + M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, + Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); + CGM.addCompilerUsedGlobal(GV); + return; + } + + // Host side: emit an opaque void* shadow. Layout doesn't matter — the + // runtime locates it by name via hipGetSymbolAddress and treats it as + // the address of the device-side struct. Registration with the HIP + // runtime is added by makeRegisterGlobalsFn (non-RDC) or + // createOffloadingEntries (RDC). + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + OffloadProfShadow = new llvm::GlobalVariable( + M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), Name); + CGM.addCompilerUsedGlobal(OffloadProfShadow); } // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { transformManagedVars(); + emitOffloadProfilingSections(); if (CGM.getLangOpts().CUDAIsDevice) { // Mark ODR-used device variables as compiler used to prevent it from being // eliminated by optimization. This is necessary for device variables diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip b/clang/test/CodeGenHIP/offload-pgo-sections.hip new file mode 100644 index 0000000000000..17c6fe7b9e609 --- /dev/null +++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip @@ -0,0 +1,50 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: x86-registered-target + +// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global +// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct +// in addrspace(1). Host compile: void* shadow registered with the HIP +// runtime and with the profile runtime's drain list. + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=DEV %s + +// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=HOST %s + +// Guard: no PGO -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +// Guard: no CUID -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ int helper(int x) { return x + 1; } +__global__ void kernel(int *p) { *p = helper(*p); } + +// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64 +// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version +// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] + +// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] +// HOST: define internal void @__hip_register_globals +// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]], +// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) + +// NONE-NOT: __llvm_profile_sections_ +// NONE-NOT: __llvm_profile_offload_register_shadow_variable diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index e88321d822f84..2693b35cad5c7 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -322,6 +322,20 @@ option(COMPILER_RT_USE_ATOMIC_LIBRARY "Use compiler-rt atomic instead of libatom option(COMPILER_RT_PROFILE_BAREMETAL "Build minimal baremetal profile library" OFF) +set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM ON) +if(APPLE OR WIN32) + # On Windows the HIP host interceptor (InstrProfilingPlatformROCm.cpp) pulls in + # RTInterception + sanitizer_common, which are built /MD and force the whole + # profile library to /MD, breaking the default static-CRT (/MT) profile builds. + # Device PGO on Windows is not wired up yet, so default this off; users who need + # it can opt in explicitly and accept the /MD profile library. + set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM OFF) +endif() +option(COMPILER_RT_BUILD_PROFILE_ROCM + "Build the host-side ROCm/HIP device profile collection runtime" + ${DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM}) +mark_as_advanced(COMPILER_RT_BUILD_PROFILE_ROCM) + include(config-ix) #================================ diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 8d9a773412a22..02c94cd2c4db2 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -93,6 +93,9 @@ if (NOT COMPILER_RT_PROFILE_BAREMETAL) InstrProfilingUtil.c InstrProfilingValue.c ) + if(COMPILER_RT_BUILD_PROFILE_ROCM) + list(APPEND PROFILE_SOURCES InstrProfilingPlatformROCm.cpp) + endif() endif() set(PROFILE_HEADERS @@ -155,6 +158,46 @@ if(COMPILER_RT_PROFILE_BAREMETAL) -DCOMPILER_RT_PROFILE_BAREMETAL=1) endif() +# The HIP host interceptor in InstrProfilingPlatformROCm.cpp pulls in +# RTInterception + sanitizer_common object libs. Those targets are only created +# when COMPILER_RT_BUILD_SANITIZERS / _MEMPROF / _XRAY / _CTX_PROFILE is enabled +# (see lib/CMakeLists.txt). Only merge them (and keep the ROCm source) when ROCm +# device PGO is requested and the targets exist; otherwise skip both so the +# static archive stays self-contained. This also keeps the profile library on +# the static CRT (/MT) on Windows by default, since the merged sanitizer object +# libs are built /MD. +set(PROFILE_OBJECT_LIBS) +set(PROFILE_HAS_HIP_INTERCEPTOR FALSE) +if(COMPILER_RT_BUILD_PROFILE_ROCM + AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL + AND TARGET RTInterception.${COMPILER_RT_DEFAULT_TARGET_ARCH} + AND TARGET RTSanitizerCommon.${COMPILER_RT_DEFAULT_TARGET_ARCH} + AND TARGET RTSanitizerCommonLibc.${COMPILER_RT_DEFAULT_TARGET_ARCH}) + # RTInterception references __sanitizer_internal_{memcpy,memset,memmove} and other + # sanitizer_common symbols; merge the same object libs as clang_rt.cfi (without + # coverage/symbolizer) so -fprofile-instr-generate links stay self-contained. + list(APPEND PROFILE_OBJECT_LIBS + RTInterception + RTSanitizerCommon + RTSanitizerCommonLibc) + set(PROFILE_HAS_HIP_INTERCEPTOR TRUE) +endif() + +if(NOT PROFILE_HAS_HIP_INTERCEPTOR) + list(REMOVE_ITEM PROFILE_SOURCES InstrProfilingPlatformROCm.cpp) +endif() + +# Only advertise the ROCm interceptor to InstrProfilingFile.c when its +# definition (InstrProfilingPlatformROCm.cpp) is actually compiled into the +# archive. Otherwise InstrProfilingFile.c references +# __llvm_profile_hip_collect_device_data with no definition; on COFF/Windows +# there is no weak-undefined fallback, so the link fails (see PR #200111). +if(COMPILER_RT_BUILD_PROFILE_ROCM AND PROFILE_HAS_HIP_INTERCEPTOR) + set(EXTRA_FLAGS + ${EXTRA_FLAGS} + -DCOMPILER_RT_BUILD_PROFILE_ROCM=1) +endif() + if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx") append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding EXTRA_FLAGS) append_list_if(COMPILER_RT_HAS_NOGPULIB_FLAG -nogpulib EXTRA_FLAGS) @@ -168,13 +211,24 @@ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx") endif() if(MSVC) - # profile historically has only been supported with the static runtime - # on windows - set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) + # profile historically used the static CRT (/MT). When we merge RTInterception and + # RTSanitizerCommon (same object libs as clang_rt.cfi on ELF), those targets are + # built with MultiThreadedDLL (/MD) — see interception/CMakeLists.txt and + # sanitizer_common/CMakeLists.txt. Mixing /MD objects into a /MT libclang_rt.profile + # yields LNK2019 (__imp__stricmp from interception_win.cpp) and LNK4098 in Profile-*. + if(PROFILE_HAS_HIP_INTERCEPTOR) + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL) + else() + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) + endif() endif() # We don't use the C++ Standard Library here, so avoid including it by mistake. append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS) +# C++ profile sources (e.g. InstrProfilingPlatformROCm.cpp) must not emit exception +# personality symbols: host libclang_rt.profile.a is linked from C code and from C++ +# tests that do not pull in __gxx_personality_v0 (Profile-* / premerge). +append_list_if(COMPILER_RT_HAS_FNO_EXCEPTIONS_FLAG -fno-exceptions EXTRA_FLAGS) # XRay uses C++ standard library headers. string(REGEX REPLACE "-?-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") @@ -200,6 +254,7 @@ if(APPLE) STATIC OS ${PROFILE_SUPPORTED_OS} ARCHS ${PROFILE_SUPPORTED_ARCH} + OBJECT_LIBS ${PROFILE_OBJECT_LIBS} CFLAGS ${EXTRA_FLAGS} SOURCES ${PROFILE_SOURCES} ADDITIONAL_HEADERS ${PROFILE_HEADERS} @@ -209,6 +264,7 @@ else() add_compiler_rt_runtime(clang_rt.profile STATIC ARCHS ${PROFILE_SUPPORTED_ARCH} + OBJECT_LIBS ${PROFILE_OBJECT_LIBS} CFLAGS ${EXTRA_FLAGS} SOURCES ${PROFILE_SOURCES} ADDITIONAL_HEADERS ${PROFILE_HEADERS} diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 71127b05aafb8..9ea5a2638fac9 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -41,6 +41,23 @@ #include "InstrProfilingPort.h" #include "InstrProfilingUtil.h" +/* Weak so non-HIP programs do not force InstrProfilingPlatformROCm.o (and its + * transitive sanitizer_common / interception dependencies) into the host link + * out of libclang_rt.profile.a. HIP programs emit strong references to other + * ROCm-runtime symbols (e.g. __llvm_profile_offload_register_shadow_variable) + * that pull in the strong definition. + * No COMPILER_RT_VISIBILITY: a hidden weak-undefined symbol is non-preemptible + * and the address test at the call site would fold to true. + * Windows: __declspec(selectany) is data-only, and the ROCm interceptor path + * is not used there, so keep the original strong extern. */ +#if COMPILER_RT_BUILD_PROFILE_ROCM +#if defined(_WIN32) +extern int __llvm_profile_hip_collect_device_data(void); +#else +__attribute__((weak)) int __llvm_profile_hip_collect_device_data(void); +#endif +#endif + /* From where is profile name specified. * The order the enumerators define their * precedence. Re-order them may lead to @@ -1198,6 +1215,19 @@ int __llvm_profile_write_file(void) { if (rc) PROF_ERR("Failed to write file \"%s\": %s\n", Filename, strerror(errno)); + /* On non-Windows the declaration is weak: only invoked when + * InstrProfilingPlatformROCm.o is in the link, which happens when the program + * references other ROCm-runtime symbols (HIP-with-PGO). Warning on failure is + * handled inside the callee. */ +#if COMPILER_RT_BUILD_PROFILE_ROCM +#if defined(_WIN32) + (void)__llvm_profile_hip_collect_device_data(); +#else + if (&__llvm_profile_hip_collect_device_data) + (void)__llvm_profile_hip_collect_device_data(); +#endif +#endif + // Restore SIGKILL. if (PDeathSig == 1) lprofRestoreSigKill(); diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp new file mode 100644 index 0000000000000..ee00c572e3a42 --- /dev/null +++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp @@ -0,0 +1,897 @@ +//===- InstrProfilingPlatformROCm.cpp - Profile data ROCm platform -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +extern "C" { +#include "InstrProfiling.h" +#include "InstrProfilingInternal.h" +#include "InstrProfilingPort.h" +} + +#include "interception/interception.h" +// C librar... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/201499 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
