llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-pgo

@llvm/pr-subscribers-backend-amdgpu

Author: Yaxun (Sam) Liu (yxsamliu)

<details>
<summary>Changes</summary>

Reland of #<!-- -->177665, reverted in #<!-- -->201416 along with its merged 
follow-ups
#<!-- -->200101, #<!-- -->200127, #<!-- -->200111, and #<!-- -->200859.

The revert was triggered because the merged change still broke Windows profiling
builds that link `clang_rt.profile` with the static CRT (`/MT`): the ROCm
collection runtime pulls in `RTInterception` + `sanitizer_common`, which are
built `/MD`, forcing the profile archive to `/MD` and breaking static-CRT
consumers (`LNK2019 __imp_getpid` …).

This reland restores the reverted commits and fixes that last regression:
`COMPILER_RT_BUILD_PROFILE_ROCM` now defaults off on Windows (already off on
Apple), with the interceptor object-lib merge and the `Profile-*` test `/MD` CRT
model gated on it. The Windows archive stays `/MT`, so static-CRT consumers link
again. Windows device PGO is not wired up end-to-end yet, so nothing is lost;
opt in with `-DCOMPILER_RT_BUILD_PROFILE_ROCM=ON` to get the `/MD` archive.


---

Patch is 71.54 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/201499.diff


15 Files Affected:

- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+152) 
- (added) clang/test/CodeGenHIP/offload-pgo-sections.hip (+50) 
- (modified) compiler-rt/CMakeLists.txt (+14) 
- (modified) compiler-rt/lib/profile/CMakeLists.txt (+59-3) 
- (modified) compiler-rt/lib/profile/InstrProfilingFile.c (+30) 
- (added) compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp (+897) 
- (modified) compiler-rt/test/profile/CMakeLists.txt (+6-2) 
- (modified) llvm/include/llvm/IR/RuntimeLibcalls.td (+7-2) 
- (modified) llvm/include/llvm/ProfileData/InstrProf.h (+5) 
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+83-26) 
- (modified) llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp (+2-1) 
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll 
(+32) 
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll (+26) 
- (added) llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll (+36) 
- (added) llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll 
(+22) 


``````````diff
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 259b6c040706b..65f398af7902b 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -28,6 +28,7 @@
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
+  /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global,
+  /// emitted only for HIP host compiles when PGO is on. Registered via
+  /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
+  /// can locate the device-side table by name.
+  llvm::GlobalVariable *OffloadProfShadow = nullptr;
   /// Whether we generate relocatable device code.
   bool RelocatableDeviceCode;
   /// Mangle context for device.
@@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   void transformManagedVars();
   /// Create offloading entries to register globals in RDC mode.
   void createOffloadingEntries();
+  /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global.
+  /// On the device side it is the populated 7-pointer section-bounds table.
+  /// On the host side it is a placeholder void* shadow stored in
+  /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC)
+  /// or createOffloadingEntries (RDC) so the runtime can locate the
+  /// device-side table by name.
+  void emitOffloadProfilingSections();
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     }
   }
 
+  // Register the per-TU offload-profiling shadow so the host runtime can
+  // locate the matching device-side __llvm_profile_sections_<CUID>. We
+  // emit both __hipRegisterVar (so the HIP runtime can map the host
+  // shadow to the device symbol) and
+  // __llvm_profile_offload_register_shadow_variable (so the profile
+  // runtime adds the shadow to its drain list).
+  if (OffloadProfShadow) {
+    llvm::Constant *Name =
+        makeConstantString(std::string(OffloadProfShadow->getName()));
+    llvm::Value *RegisterVarArgs[] = {
+        &GpuBinaryHandlePtr,
+        OffloadProfShadow,
+        Name,
+        Name,
+        llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+        llvm::ConstantInt::get(VarSizeTy, 
CGM.getDataLayout().getPointerSize()),
+        llvm::ConstantInt::get(IntTy, /*Constant=*/0),
+        llvm::ConstantInt::get(IntTy, 0)};
+    Builder.CreateCall(RegisterVar, RegisterVarArgs);
+
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    Builder.CreateCall(RegisterShadow, {OffloadProfShadow});
+  }
+
   Builder.CreateRetVoid();
   return RegisterKernelsFunc;
 }
@@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() {
           I.Flags.getSurfTexType());
     }
   }
+
+  // Register the per-TU offload-profiling shadow. The offloading entry
+  // makes the linker-wrapper emit the host __hipRegisterVar call in the
+  // combined ctor. Separately emit a per-TU ctor that registers the
+  // shadow with the profile runtime's drain list.
+  if (OffloadProfShadow) {
+    llvm::offloading::emitOffloadingEntry(
+        M, Kind, OffloadProfShadow, OffloadProfShadow->getName(),
+        CGM.getDataLayout().getPointerSize(),
+        llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+
+    llvm::LLVMContext &Ctx = M.getContext();
+    auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    auto *CtorFn = llvm::Function::Create(
+        llvm::FunctionType::get(VoidTy, false),
+        llvm::GlobalValue::InternalLinkage,
+        "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), 
&M);
+    auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn);
+    llvm::IRBuilder<> B(Entry);
+    B.CreateCall(RegisterShadow, {OffloadProfShadow});
+    B.CreateRetVoid();
+    llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535);
+  }
+}
+
+// For HIP host+device compiles with PGO enabled, emit the per-TU global
+// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding
+// section start/stop bounds for the names/counters/data sections plus the
+// raw-version variable. Host side: an opaque void* shadow whose only
+// purpose is to give the host-runtime a registered symbol name to look up
+// via hipGetSymbolAddress; the actual device-side data lives in the
+// matching device-side global.
+void CGNVCUDARuntime::emitOffloadProfilingSections() {
+  if (!CGM.getLangOpts().HIP)
+    return;
+  if (!CGM.getCodeGenOpts().hasProfileInstr())
+    return;
+
+  StringRef CUIDHash = CGM.getContext().getCUIDHash();
+  if (CUIDHash.empty())
+    return;
+
+  llvm::Module &M = CGM.getModule();
+  llvm::LLVMContext &Ctx = M.getContext();
+  std::string Name = ("__llvm_profile_sections_" + CUIDHash).str();
+
+  // If the global already exists (e.g. another TU was merged in), don't
+  // duplicate it.
+  if (M.getNamedValue(Name))
+    return;
+
+  if (CGM.getLangOpts().CUDAIsDevice) {
+    // Device side: emit the populated struct. Section start/stop symbols
+    // are linker-defined (ELF auto-generates __start_/__stop_ for any
+    // section whose name is a valid C identifier; AMDGPU is ELF).
+    unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+    auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS);
+    auto getOrDeclare = [&](StringRef SymName) {
+      if (auto *GV = M.getNamedGlobal(SymName))
+        return GV;
+      auto *GV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName,
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+      GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+      return GV;
+    };
+    auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version");
+    if (!VersionGV) {
+      VersionGV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
+          "__llvm_profile_raw_version",
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+    }
+
+    auto *StructTy = llvm::StructType::get(
+        Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
+    llvm::Constant *Fields[] = {
+        getOrDeclare("__start___llvm_prf_names"),
+        getOrDeclare("__stop___llvm_prf_names"),
+        getOrDeclare("__start___llvm_prf_cnts"),
+        getOrDeclare("__stop___llvm_prf_cnts"),
+        getOrDeclare("__start___llvm_prf_data"),
+        getOrDeclare("__stop___llvm_prf_data"),
+        VersionGV,
+    };
+    auto *Init = llvm::ConstantStruct::get(StructTy, Fields);
+    auto *GV = new llvm::GlobalVariable(
+        M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
+        Init, Name, /*InsertBefore=*/nullptr, 
llvm::GlobalValue::NotThreadLocal,
+        GlobalAS);
+    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+    CGM.addCompilerUsedGlobal(GV);
+    return;
+  }
+
+  // Host side: emit an opaque void* shadow. Layout doesn't matter — the
+  // runtime locates it by name via hipGetSymbolAddress and treats it as
+  // the address of the device-side struct. Registration with the HIP
+  // runtime is added by makeRegisterGlobalsFn (non-RDC) or
+  // createOffloadingEntries (RDC).
+  auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+  OffloadProfShadow = new llvm::GlobalVariable(
+      M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
+      llvm::ConstantPointerNull::get(PtrTy), Name);
+  CGM.addCompilerUsedGlobal(OffloadProfShadow);
 }
 
 // Returns module constructor to be added.
 llvm::Function *CGNVCUDARuntime::finalizeModule() {
   transformManagedVars();
+  emitOffloadProfilingSections();
   if (CGM.getLangOpts().CUDAIsDevice) {
     // Mark ODR-used device variables as compiler used to prevent it from being
     // eliminated by optimization. This is necessary for device variables
diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip 
b/clang/test/CodeGenHIP/offload-pgo-sections.hip
new file mode 100644
index 0000000000000..17c6fe7b9e609
--- /dev/null
+++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip
@@ -0,0 +1,50 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global
+// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct
+// in addrspace(1). Host compile: void* shadow registered with the HIP
+// runtime and with the profile runtime's drain list.
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=HOST %s
+
+// Guard: no PGO -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+// Guard: no CUID -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ int helper(int x) { return x + 1; }
+__global__ void kernel(int *p) { *p = helper(*p); }
+
+// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64
+// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) 
constant 
{{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version
+// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+
+// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+// HOST: define internal void @__hip_register_globals
+// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]],
+// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr 
@__llvm_profile_sections_[[CUID]])
+
+// NONE-NOT: __llvm_profile_sections_
+// NONE-NOT: __llvm_profile_offload_register_shadow_variable
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index e88321d822f84..2693b35cad5c7 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -322,6 +322,20 @@ option(COMPILER_RT_USE_ATOMIC_LIBRARY "Use compiler-rt 
atomic instead of libatom
 
 option(COMPILER_RT_PROFILE_BAREMETAL "Build minimal baremetal profile library" 
OFF)
 
+set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM ON)
+if(APPLE OR WIN32)
+  # On Windows the HIP host interceptor (InstrProfilingPlatformROCm.cpp) pulls 
in
+  # RTInterception + sanitizer_common, which are built /MD and force the whole
+  # profile library to /MD, breaking the default static-CRT (/MT) profile 
builds.
+  # Device PGO on Windows is not wired up yet, so default this off; users who 
need
+  # it can opt in explicitly and accept the /MD profile library.
+  set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM OFF)
+endif()
+option(COMPILER_RT_BUILD_PROFILE_ROCM
+  "Build the host-side ROCm/HIP device profile collection runtime"
+  ${DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM})
+mark_as_advanced(COMPILER_RT_BUILD_PROFILE_ROCM)
+
 include(config-ix)
 
 #================================
diff --git a/compiler-rt/lib/profile/CMakeLists.txt 
b/compiler-rt/lib/profile/CMakeLists.txt
index 8d9a773412a22..02c94cd2c4db2 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -93,6 +93,9 @@ if (NOT COMPILER_RT_PROFILE_BAREMETAL)
     InstrProfilingUtil.c
     InstrProfilingValue.c
     )
+  if(COMPILER_RT_BUILD_PROFILE_ROCM)
+    list(APPEND PROFILE_SOURCES InstrProfilingPlatformROCm.cpp)
+  endif()
 endif()
 
 set(PROFILE_HEADERS
@@ -155,6 +158,46 @@ if(COMPILER_RT_PROFILE_BAREMETAL)
      -DCOMPILER_RT_PROFILE_BAREMETAL=1)
 endif()
 
+# The HIP host interceptor in InstrProfilingPlatformROCm.cpp pulls in
+# RTInterception + sanitizer_common object libs. Those targets are only created
+# when COMPILER_RT_BUILD_SANITIZERS / _MEMPROF / _XRAY / _CTX_PROFILE is 
enabled
+# (see lib/CMakeLists.txt). Only merge them (and keep the ROCm source) when 
ROCm
+# device PGO is requested and the targets exist; otherwise skip both so the
+# static archive stays self-contained. This also keeps the profile library on
+# the static CRT (/MT) on Windows by default, since the merged sanitizer object
+# libs are built /MD.
+set(PROFILE_OBJECT_LIBS)
+set(PROFILE_HAS_HIP_INTERCEPTOR FALSE)
+if(COMPILER_RT_BUILD_PROFILE_ROCM
+   AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL
+   AND TARGET RTInterception.${COMPILER_RT_DEFAULT_TARGET_ARCH}
+   AND TARGET RTSanitizerCommon.${COMPILER_RT_DEFAULT_TARGET_ARCH}
+   AND TARGET RTSanitizerCommonLibc.${COMPILER_RT_DEFAULT_TARGET_ARCH})
+  # RTInterception references __sanitizer_internal_{memcpy,memset,memmove} and 
other
+  # sanitizer_common symbols; merge the same object libs as clang_rt.cfi 
(without
+  # coverage/symbolizer) so -fprofile-instr-generate links stay self-contained.
+  list(APPEND PROFILE_OBJECT_LIBS
+    RTInterception
+    RTSanitizerCommon
+    RTSanitizerCommonLibc)
+  set(PROFILE_HAS_HIP_INTERCEPTOR TRUE)
+endif()
+
+if(NOT PROFILE_HAS_HIP_INTERCEPTOR)
+  list(REMOVE_ITEM PROFILE_SOURCES InstrProfilingPlatformROCm.cpp)
+endif()
+
+# Only advertise the ROCm interceptor to InstrProfilingFile.c when its
+# definition (InstrProfilingPlatformROCm.cpp) is actually compiled into the
+# archive. Otherwise InstrProfilingFile.c references
+# __llvm_profile_hip_collect_device_data with no definition; on COFF/Windows
+# there is no weak-undefined fallback, so the link fails (see PR #200111).
+if(COMPILER_RT_BUILD_PROFILE_ROCM AND PROFILE_HAS_HIP_INTERCEPTOR)
+  set(EXTRA_FLAGS
+      ${EXTRA_FLAGS}
+      -DCOMPILER_RT_BUILD_PROFILE_ROCM=1)
+endif()
+
 if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx")
   append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding EXTRA_FLAGS)
   append_list_if(COMPILER_RT_HAS_NOGPULIB_FLAG -nogpulib EXTRA_FLAGS)
@@ -168,13 +211,24 @@ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES 
"amdgcn|nvptx")
 endif()
 
 if(MSVC)
-  # profile historically has only been supported with the static runtime
-  # on windows
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+  # profile historically used the static CRT (/MT). When we merge 
RTInterception and
+  # RTSanitizerCommon (same object libs as clang_rt.cfi on ELF), those targets 
are
+  # built with MultiThreadedDLL (/MD) — see interception/CMakeLists.txt and
+  # sanitizer_common/CMakeLists.txt. Mixing /MD objects into a /MT 
libclang_rt.profile
+  # yields LNK2019 (__imp__stricmp from interception_win.cpp) and LNK4098 in 
Profile-*.
+  if(PROFILE_HAS_HIP_INTERCEPTOR)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
+  else()
+    set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+  endif()
 endif()
 
 # We don't use the C++ Standard Library here, so avoid including it by mistake.
 append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
+# C++ profile sources (e.g. InstrProfilingPlatformROCm.cpp) must not emit 
exception
+# personality symbols: host libclang_rt.profile.a is linked from C code and 
from C++
+# tests that do not pull in __gxx_personality_v0 (Profile-* / premerge).
+append_list_if(COMPILER_RT_HAS_FNO_EXCEPTIONS_FLAG -fno-exceptions EXTRA_FLAGS)
 # XRay uses C++ standard library headers.
 string(REGEX REPLACE "-?-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS}")
 
@@ -200,6 +254,7 @@ if(APPLE)
     STATIC
     OS ${PROFILE_SUPPORTED_OS}
     ARCHS ${PROFILE_SUPPORTED_ARCH}
+    OBJECT_LIBS ${PROFILE_OBJECT_LIBS}
     CFLAGS ${EXTRA_FLAGS}
     SOURCES ${PROFILE_SOURCES}
     ADDITIONAL_HEADERS ${PROFILE_HEADERS}
@@ -209,6 +264,7 @@ else()
   add_compiler_rt_runtime(clang_rt.profile
     STATIC
     ARCHS ${PROFILE_SUPPORTED_ARCH}
+    OBJECT_LIBS ${PROFILE_OBJECT_LIBS}
     CFLAGS ${EXTRA_FLAGS}
     SOURCES ${PROFILE_SOURCES}
     ADDITIONAL_HEADERS ${PROFILE_HEADERS}
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c 
b/compiler-rt/lib/profile/InstrProfilingFile.c
index 71127b05aafb8..9ea5a2638fac9 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -41,6 +41,23 @@
 #include "InstrProfilingPort.h"
 #include "InstrProfilingUtil.h"
 
+/* Weak so non-HIP programs do not force InstrProfilingPlatformROCm.o (and its
+ * transitive sanitizer_common / interception dependencies) into the host link
+ * out of libclang_rt.profile.a. HIP programs emit strong references to other
+ * ROCm-runtime symbols (e.g. __llvm_profile_offload_register_shadow_variable)
+ * that pull in the strong definition.
+ * No COMPILER_RT_VISIBILITY: a hidden weak-undefined symbol is non-preemptible
+ * and the address test at the call site would fold to true.
+ * Windows: __declspec(selectany) is data-only, and the ROCm interceptor path
+ * is not used there, so keep the original strong extern. */
+#if COMPILER_RT_BUILD_PROFILE_ROCM
+#if defined(_WIN32)
+extern int __llvm_profile_hip_collect_device_data(void);
+#else
+__attribute__((weak)) int __llvm_profile_hip_collect_device_data(void);
+#endif
+#endif
+
 /* From where is profile name specified.
  * The order the enumerators define their
  * precedence. Re-order them may lead to
@@ -1198,6 +1215,19 @@ int __llvm_profile_write_file(void) {
   if (rc)
     PROF_ERR("Failed to write file \"%s\": %s\n", Filename, strerror(errno));
 
+  /* On non-Windows the declaration is weak: only invoked when
+   * InstrProfilingPlatformROCm.o is in the link, which happens when the 
program
+   * references other ROCm-runtime symbols (HIP-with-PGO). Warning on failure 
is
+   * handled inside the callee. */
+#if COMPILER_RT_BUILD_PROFILE_ROCM
+#if defined(_WIN32)
+  (void)__llvm_profile_hip_collect_device_data();
+#else
+  if (&__llvm_profile_hip_collect_device_data)
+    (void)__llvm_profile_hip_collect_device_data();
+#endif
+#endif
+
   // Restore SIGKILL.
   if (PDeathSig == 1)
     lprofRestoreSigKill();
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp 
b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
new file mode 100644
index 0000000000000..ee00c572e3a42
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
@@ -0,0 +1,897 @@
+//===- InstrProfilingPlatformROCm.cpp - Profile data ROCm platform -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+extern "C" {
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingPort.h"
+}
+
+#include "interception/interception.h"
+// C librar...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/201499
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to