[clang] [compiler-rt] [llvm] Reland "[PGO][AMDGPU] Add basic HIP offload PGO support (#177665)" (PR #201499)

Yaxun Liu via cfe-commits Wed, 03 Jun 2026 20:51:08 -0700

https://github.com/yxsamliu created 
https://github.com/llvm/llvm-project/pull/201499


Reland of #177665, reverted in #201416 along with its merged follow-ups
#200101, #200127, #200111, and #200859.

The revert was triggered because the merged change still broke Windows profiling
builds that link `clang_rt.profile` with the static CRT (`/MT`): the ROCm
collection runtime pulls in `RTInterception` + `sanitizer_common`, which are
built `/MD`, forcing the profile archive to `/MD` and breaking static-CRT
consumers (`LNK2019 __imp_getpid` …).

This reland restores the reverted commits and fixes that last regression:
`COMPILER_RT_BUILD_PROFILE_ROCM` now defaults off on Windows (already off on
Apple), with the interceptor object-lib merge and the `Profile-*` test `/MD` CRT
model gated on it. The Windows archive stays `/MT`, so static-CRT consumers link
again. Windows device PGO is not wired up end-to-end yet, so nothing is lost;
opt in with `-DCOMPILER_RT_BUILD_PROFILE_ROCM=ON` to get the `/MD` archive.


>From a7249a2b2fe192f3fe3ea767ab1f7a665df047fd Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <[email protected]>
Date: Wed, 3 Jun 2026 22:18:59 -0400
Subject: [PATCH 1/2] Revert "Revert "[PGO][AMDGPU] Add basic HIP offload PGO
 support (#177665)" (#201416)"

This reverts commit 6cfa1a01a0f737ed1d54963810057fb1bd67a274.
---
 clang/lib/CodeGen/CGCUDANV.cpp                | 152 +++
 .../test/CodeGenHIP/offload-pgo-sections.hip  |  50 +
 compiler-rt/CMakeLists.txt                    |   9 +
 compiler-rt/lib/profile/CMakeLists.txt        |  59 +-
 compiler-rt/lib/profile/InstrProfilingFile.c  |  30 +
 .../profile/InstrProfilingPlatformROCm.cpp    | 897 ++++++++++++++++++
 llvm/include/llvm/IR/RuntimeLibcalls.td       |   9 +-
 llvm/include/llvm/ProfileData/InstrProf.h     |   5 +
 .../Instrumentation/InstrProfiling.cpp        | 109 ++-
 .../Instrumentation/PGOInstrumentation.cpp    |   3 +-
 .../InstrProfiling/amdgpu-instrumentation.ll  |  32 +
 .../InstrProfiling/amdgpu-profc-arrays.ll     |  26 +
 .../InstrProfiling/gpu-weak.ll                |  36 +
 .../amdgpu-disable-value-profiling.ll         |  22 +
 14 files changed, 1407 insertions(+), 32 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/offload-pgo-sections.hip
 create mode 100644 compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
 create mode 100644 
llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll
 create mode 100644 
llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll
 create mode 100644 
llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 259b6c040706b..65f398af7902b 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -28,6 +28,7 @@
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
+  /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global,
+  /// emitted only for HIP host compiles when PGO is on. Registered via
+  /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
+  /// can locate the device-side table by name.
+  llvm::GlobalVariable *OffloadProfShadow = nullptr;
   /// Whether we generate relocatable device code.
   bool RelocatableDeviceCode;
   /// Mangle context for device.
@@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   void transformManagedVars();
   /// Create offloading entries to register globals in RDC mode.
   void createOffloadingEntries();
+  /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global.
+  /// On the device side it is the populated 7-pointer section-bounds table.
+  /// On the host side it is a placeholder void* shadow stored in
+  /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC)
+  /// or createOffloadingEntries (RDC) so the runtime can locate the
+  /// device-side table by name.
+  void emitOffloadProfilingSections();
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     }
   }
 
+  // Register the per-TU offload-profiling shadow so the host runtime can
+  // locate the matching device-side __llvm_profile_sections_<CUID>. We
+  // emit both __hipRegisterVar (so the HIP runtime can map the host
+  // shadow to the device symbol) and
+  // __llvm_profile_offload_register_shadow_variable (so the profile
+  // runtime adds the shadow to its drain list).
+  if (OffloadProfShadow) {
+    llvm::Constant *Name =
+        makeConstantString(std::string(OffloadProfShadow->getName()));
+    llvm::Value *RegisterVarArgs[] = {
+        &GpuBinaryHandlePtr,
+        OffloadProfShadow,
+        Name,
+        Name,
+        llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+        llvm::ConstantInt::get(VarSizeTy, 
CGM.getDataLayout().getPointerSize()),
+        llvm::ConstantInt::get(IntTy, /*Constant=*/0),
+        llvm::ConstantInt::get(IntTy, 0)};
+    Builder.CreateCall(RegisterVar, RegisterVarArgs);
+
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    Builder.CreateCall(RegisterShadow, {OffloadProfShadow});
+  }
+
   Builder.CreateRetVoid();
   return RegisterKernelsFunc;
 }
@@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() {
           I.Flags.getSurfTexType());
     }
   }
+
+  // Register the per-TU offload-profiling shadow. The offloading entry
+  // makes the linker-wrapper emit the host __hipRegisterVar call in the
+  // combined ctor. Separately emit a per-TU ctor that registers the
+  // shadow with the profile runtime's drain list.
+  if (OffloadProfShadow) {
+    llvm::offloading::emitOffloadingEntry(
+        M, Kind, OffloadProfShadow, OffloadProfShadow->getName(),
+        CGM.getDataLayout().getPointerSize(),
+        llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+
+    llvm::LLVMContext &Ctx = M.getContext();
+    auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    auto *CtorFn = llvm::Function::Create(
+        llvm::FunctionType::get(VoidTy, false),
+        llvm::GlobalValue::InternalLinkage,
+        "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), 
&M);
+    auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn);
+    llvm::IRBuilder<> B(Entry);
+    B.CreateCall(RegisterShadow, {OffloadProfShadow});
+    B.CreateRetVoid();
+    llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535);
+  }
+}
+
+// For HIP host+device compiles with PGO enabled, emit the per-TU global
+// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding
+// section start/stop bounds for the names/counters/data sections plus the
+// raw-version variable. Host side: an opaque void* shadow whose only
+// purpose is to give the host-runtime a registered symbol name to look up
+// via hipGetSymbolAddress; the actual device-side data lives in the
+// matching device-side global.
+void CGNVCUDARuntime::emitOffloadProfilingSections() {
+  if (!CGM.getLangOpts().HIP)
+    return;
+  if (!CGM.getCodeGenOpts().hasProfileInstr())
+    return;
+
+  StringRef CUIDHash = CGM.getContext().getCUIDHash();
+  if (CUIDHash.empty())
+    return;
+
+  llvm::Module &M = CGM.getModule();
+  llvm::LLVMContext &Ctx = M.getContext();
+  std::string Name = ("__llvm_profile_sections_" + CUIDHash).str();
+
+  // If the global already exists (e.g. another TU was merged in), don't
+  // duplicate it.
+  if (M.getNamedValue(Name))
+    return;
+
+  if (CGM.getLangOpts().CUDAIsDevice) {
+    // Device side: emit the populated struct. Section start/stop symbols
+    // are linker-defined (ELF auto-generates __start_/__stop_ for any
+    // section whose name is a valid C identifier; AMDGPU is ELF).
+    unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+    auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS);
+    auto getOrDeclare = [&](StringRef SymName) {
+      if (auto *GV = M.getNamedGlobal(SymName))
+        return GV;
+      auto *GV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName,
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+      GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+      return GV;
+    };
+    auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version");
+    if (!VersionGV) {
+      VersionGV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
+          "__llvm_profile_raw_version",
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+    }
+
+    auto *StructTy = llvm::StructType::get(
+        Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
+    llvm::Constant *Fields[] = {
+        getOrDeclare("__start___llvm_prf_names"),
+        getOrDeclare("__stop___llvm_prf_names"),
+        getOrDeclare("__start___llvm_prf_cnts"),
+        getOrDeclare("__stop___llvm_prf_cnts"),
+        getOrDeclare("__start___llvm_prf_data"),
+        getOrDeclare("__stop___llvm_prf_data"),
+        VersionGV,
+    };
+    auto *Init = llvm::ConstantStruct::get(StructTy, Fields);
+    auto *GV = new llvm::GlobalVariable(
+        M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
+        Init, Name, /*InsertBefore=*/nullptr, 
llvm::GlobalValue::NotThreadLocal,
+        GlobalAS);
+    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+    CGM.addCompilerUsedGlobal(GV);
+    return;
+  }
+
+  // Host side: emit an opaque void* shadow. Layout doesn't matter — the
+  // runtime locates it by name via hipGetSymbolAddress and treats it as
+  // the address of the device-side struct. Registration with the HIP
+  // runtime is added by makeRegisterGlobalsFn (non-RDC) or
+  // createOffloadingEntries (RDC).
+  auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+  OffloadProfShadow = new llvm::GlobalVariable(
+      M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
+      llvm::ConstantPointerNull::get(PtrTy), Name);
+  CGM.addCompilerUsedGlobal(OffloadProfShadow);
 }
 
 // Returns module constructor to be added.
 llvm::Function *CGNVCUDARuntime::finalizeModule() {
   transformManagedVars();
+  emitOffloadProfilingSections();
   if (CGM.getLangOpts().CUDAIsDevice) {
     // Mark ODR-used device variables as compiler used to prevent it from being
     // eliminated by optimization. This is necessary for device variables
diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip 
b/clang/test/CodeGenHIP/offload-pgo-sections.hip
new file mode 100644
index 0000000000000..17c6fe7b9e609
--- /dev/null
+++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip
@@ -0,0 +1,50 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global
+// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct
+// in addrspace(1). Host compile: void* shadow registered with the HIP
+// runtime and with the profile runtime's drain list.
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=HOST %s
+
+// Guard: no PGO -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+// Guard: no CUID -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ int helper(int x) { return x + 1; }
+__global__ void kernel(int *p) { *p = helper(*p); }
+
+// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64
+// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) 
constant 
{{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version
+// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+
+// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+// HOST: define internal void @__hip_register_globals
+// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]],
+// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr 
@__llvm_profile_sections_[[CUID]])
+
+// NONE-NOT: __llvm_profile_sections_
+// NONE-NOT: __llvm_profile_offload_register_shadow_variable
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index e88321d822f84..39034fd9ba67d 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -322,6 +322,15 @@ option(COMPILER_RT_USE_ATOMIC_LIBRARY "Use compiler-rt 
atomic instead of libatom
 
 option(COMPILER_RT_PROFILE_BAREMETAL "Build minimal baremetal profile library" 
OFF)
 
+set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM ON)
+if(APPLE)
+  set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM OFF)
+endif()
+option(COMPILER_RT_BUILD_PROFILE_ROCM
+  "Build the host-side ROCm/HIP device profile collection runtime"
+  ${DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM})
+mark_as_advanced(COMPILER_RT_BUILD_PROFILE_ROCM)
+
 include(config-ix)
 
 #================================
diff --git a/compiler-rt/lib/profile/CMakeLists.txt 
b/compiler-rt/lib/profile/CMakeLists.txt
index 8d9a773412a22..77db2477bb7c6 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -93,6 +93,9 @@ if (NOT COMPILER_RT_PROFILE_BAREMETAL)
     InstrProfilingUtil.c
     InstrProfilingValue.c
     )
+  if(COMPILER_RT_BUILD_PROFILE_ROCM)
+    list(APPEND PROFILE_SOURCES InstrProfilingPlatformROCm.cpp)
+  endif()
 endif()
 
 set(PROFILE_HEADERS
@@ -155,6 +158,43 @@ if(COMPILER_RT_PROFILE_BAREMETAL)
      -DCOMPILER_RT_PROFILE_BAREMETAL=1)
 endif()
 
+# The HIP host interceptor in InstrProfilingPlatformROCm.cpp pulls in
+# RTInterception + sanitizer_common object libs. Those targets are only created
+# when COMPILER_RT_BUILD_SANITIZERS / _MEMPROF / _XRAY / _CTX_PROFILE is 
enabled
+# (see lib/CMakeLists.txt). In a profile-only build the targets do not exist;
+# skip both the object-lib merge and the ROCm source file so the static archive
+# remains self-contained.
+set(PROFILE_OBJECT_LIBS)
+set(PROFILE_HAS_HIP_INTERCEPTOR FALSE)
+if(COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL
+   AND TARGET RTInterception.${COMPILER_RT_DEFAULT_TARGET_ARCH}
+   AND TARGET RTSanitizerCommon.${COMPILER_RT_DEFAULT_TARGET_ARCH}
+   AND TARGET RTSanitizerCommonLibc.${COMPILER_RT_DEFAULT_TARGET_ARCH})
+  # RTInterception references __sanitizer_internal_{memcpy,memset,memmove} and 
other
+  # sanitizer_common symbols; merge the same object libs as clang_rt.cfi 
(without
+  # coverage/symbolizer) so -fprofile-instr-generate links stay self-contained.
+  list(APPEND PROFILE_OBJECT_LIBS
+    RTInterception
+    RTSanitizerCommon
+    RTSanitizerCommonLibc)
+  set(PROFILE_HAS_HIP_INTERCEPTOR TRUE)
+endif()
+
+if(NOT PROFILE_HAS_HIP_INTERCEPTOR)
+  list(REMOVE_ITEM PROFILE_SOURCES InstrProfilingPlatformROCm.cpp)
+endif()
+
+# Only advertise the ROCm interceptor to InstrProfilingFile.c when its
+# definition (InstrProfilingPlatformROCm.cpp) is actually compiled into the
+# archive. Otherwise InstrProfilingFile.c references
+# __llvm_profile_hip_collect_device_data with no definition; on COFF/Windows
+# there is no weak-undefined fallback, so the link fails (see PR #200111).
+if(COMPILER_RT_BUILD_PROFILE_ROCM AND PROFILE_HAS_HIP_INTERCEPTOR)
+  set(EXTRA_FLAGS
+      ${EXTRA_FLAGS}
+      -DCOMPILER_RT_BUILD_PROFILE_ROCM=1)
+endif()
+
 if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx")
   append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding EXTRA_FLAGS)
   append_list_if(COMPILER_RT_HAS_NOGPULIB_FLAG -nogpulib EXTRA_FLAGS)
@@ -168,13 +208,24 @@ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES 
"amdgcn|nvptx")
 endif()
 
 if(MSVC)
-  # profile historically has only been supported with the static runtime
-  # on windows
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+  # profile historically used the static CRT (/MT). When we merge 
RTInterception and
+  # RTSanitizerCommon (same object libs as clang_rt.cfi on ELF), those targets 
are
+  # built with MultiThreadedDLL (/MD) — see interception/CMakeLists.txt and
+  # sanitizer_common/CMakeLists.txt. Mixing /MD objects into a /MT 
libclang_rt.profile
+  # yields LNK2019 (__imp__stricmp from interception_win.cpp) and LNK4098 in 
Profile-*.
+  if(PROFILE_HAS_HIP_INTERCEPTOR)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
+  else()
+    set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+  endif()
 endif()
 
 # We don't use the C++ Standard Library here, so avoid including it by mistake.
 append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
+# C++ profile sources (e.g. InstrProfilingPlatformROCm.cpp) must not emit 
exception
+# personality symbols: host libclang_rt.profile.a is linked from C code and 
from C++
+# tests that do not pull in __gxx_personality_v0 (Profile-* / premerge).
+append_list_if(COMPILER_RT_HAS_FNO_EXCEPTIONS_FLAG -fno-exceptions EXTRA_FLAGS)
 # XRay uses C++ standard library headers.
 string(REGEX REPLACE "-?-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS}")
 
@@ -200,6 +251,7 @@ if(APPLE)
     STATIC
     OS ${PROFILE_SUPPORTED_OS}
     ARCHS ${PROFILE_SUPPORTED_ARCH}
+    OBJECT_LIBS ${PROFILE_OBJECT_LIBS}
     CFLAGS ${EXTRA_FLAGS}
     SOURCES ${PROFILE_SOURCES}
     ADDITIONAL_HEADERS ${PROFILE_HEADERS}
@@ -209,6 +261,7 @@ else()
   add_compiler_rt_runtime(clang_rt.profile
     STATIC
     ARCHS ${PROFILE_SUPPORTED_ARCH}
+    OBJECT_LIBS ${PROFILE_OBJECT_LIBS}
     CFLAGS ${EXTRA_FLAGS}
     SOURCES ${PROFILE_SOURCES}
     ADDITIONAL_HEADERS ${PROFILE_HEADERS}
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c 
b/compiler-rt/lib/profile/InstrProfilingFile.c
index 71127b05aafb8..9ea5a2638fac9 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -41,6 +41,23 @@
 #include "InstrProfilingPort.h"
 #include "InstrProfilingUtil.h"
 
+/* Weak so non-HIP programs do not force InstrProfilingPlatformROCm.o (and its
+ * transitive sanitizer_common / interception dependencies) into the host link
+ * out of libclang_rt.profile.a. HIP programs emit strong references to other
+ * ROCm-runtime symbols (e.g. __llvm_profile_offload_register_shadow_variable)
+ * that pull in the strong definition.
+ * No COMPILER_RT_VISIBILITY: a hidden weak-undefined symbol is non-preemptible
+ * and the address test at the call site would fold to true.
+ * Windows: __declspec(selectany) is data-only, and the ROCm interceptor path
+ * is not used there, so keep the original strong extern. */
+#if COMPILER_RT_BUILD_PROFILE_ROCM
+#if defined(_WIN32)
+extern int __llvm_profile_hip_collect_device_data(void);
+#else
+__attribute__((weak)) int __llvm_profile_hip_collect_device_data(void);
+#endif
+#endif
+
 /* From where is profile name specified.
  * The order the enumerators define their
  * precedence. Re-order them may lead to
@@ -1198,6 +1215,19 @@ int __llvm_profile_write_file(void) {
   if (rc)
     PROF_ERR("Failed to write file \"%s\": %s\n", Filename, strerror(errno));
 
+  /* On non-Windows the declaration is weak: only invoked when
+   * InstrProfilingPlatformROCm.o is in the link, which happens when the 
program
+   * references other ROCm-runtime symbols (HIP-with-PGO). Warning on failure 
is
+   * handled inside the callee. */
+#if COMPILER_RT_BUILD_PROFILE_ROCM
+#if defined(_WIN32)
+  (void)__llvm_profile_hip_collect_device_data();
+#else
+  if (&__llvm_profile_hip_collect_device_data)
+    (void)__llvm_profile_hip_collect_device_data();
+#endif
+#endif
+
   // Restore SIGKILL.
   if (PDeathSig == 1)
     lprofRestoreSigKill();
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp 
b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
new file mode 100644
index 0000000000000..ee00c572e3a42
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
@@ -0,0 +1,897 @@
+//===- InstrProfilingPlatformROCm.cpp - Profile data ROCm platform -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+extern "C" {
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingPort.h"
+}
+
+#include "interception/interception.h"
+// C library headers (not <cstdio> etc.): clang_rt.profile is built with
+// -nostdinc++ and avoids the C++ standard library (see 
profile/CMakeLists.txt).
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#include <pthread.h>
+#endif
+
+/* Serialize one-time HIP loader resolution and DynamicModules mutations.
+ * Inline to avoid a sanitizer_common dependency. */
+#ifdef _WIN32
+static INIT_ONCE HipLoadedOnce = INIT_ONCE_STATIC_INIT;
+static CRITICAL_SECTION DynamicModulesLock;
+static INIT_ONCE DynamicModulesLockInit = INIT_ONCE_STATIC_INIT;
+static BOOL CALLBACK initDynamicModulesLockCb(PINIT_ONCE, PVOID, PVOID *) {
+  InitializeCriticalSection(&DynamicModulesLock);
+  return TRUE;
+}
+static void lockDynamicModules(void) {
+  InitOnceExecuteOnce(&DynamicModulesLockInit, initDynamicModulesLockCb, NULL,
+                      NULL);
+  EnterCriticalSection(&DynamicModulesLock);
+}
+static void unlockDynamicModules(void) {
+  LeaveCriticalSection(&DynamicModulesLock);
+}
+#else
+static pthread_once_t HipLoadedOnce = PTHREAD_ONCE_INIT;
+static pthread_mutex_t DynamicModulesLock = PTHREAD_MUTEX_INITIALIZER;
+static void lockDynamicModules(void) {
+  pthread_mutex_lock(&DynamicModulesLock);
+}
+static void unlockDynamicModules(void) {
+  pthread_mutex_unlock(&DynamicModulesLock);
+}
+#endif
+
+static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
+                                   const char *Target);
+
+static int isVerboseMode() {
+  static int IsVerbose = -1;
+  if (IsVerbose == -1)
+    IsVerbose = getenv("LLVM_PROFILE_VERBOSE") != nullptr;
+  return IsVerbose;
+}
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Dynamic loading of HIP runtime symbols                                   */
+/* -------------------------------------------------------------------------- 
*/
+
+typedef int (*hipGetSymbolAddressTy)(void **, const void *);
+typedef int (*hipMemcpyTy)(void *, const void *, size_t, int);
+typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *);
+typedef int (*hipGetDeviceCountTy)(int *);
+typedef int (*hipGetDeviceTy)(int *);
+typedef int (*hipSetDeviceTy)(int);
+
+/* Minimal hipDeviceProp_t (HIP 6.x R0600): only gcnArchName at offset 1160
+ * is read. Padded to 4096 to tolerate ABI growth. */
+typedef struct {
+  char padding[1160];
+  char gcnArchName[256];
+  char tail_padding[2680];
+} HipDevicePropMinimal;
+typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int);
+
+static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr;
+static hipMemcpyTy pHipMemcpy = nullptr;
+static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr;
+static hipGetDeviceCountTy pHipGetDeviceCount = nullptr;
+static hipGetDeviceTy pHipGetDevice = nullptr;
+static hipSetDeviceTy pHipSetDevice = nullptr;
+static hipGetDevicePropertiesTy pHipGetDeviceProperties = nullptr;
+
+static int NumDevices = 0;
+/* 256 matches hipDeviceProp_t::gcnArchName, the source field width. */
+static char (*DeviceArchNames)[256] = nullptr;
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Device-to-host copies                                                     
*/
+/*  Keep HIP-only to avoid an HSA dependency.                                 
*/
+/* -------------------------------------------------------------------------- 
*/
+
+static void doEnsureHipLoaded(void) {
+  if (!__interception::DynamicLoaderAvailable()) {
+    if (isVerboseMode())
+      PROF_NOTE("%s", "Dynamic library loading not available - "
+                      "HIP profiling disabled\n");
+    return;
+  }
+
+#ifdef _WIN32
+  static const char HipLibName[] = "amdhip64.dll";
+#else
+  static const char HipLibName[] = "libamdhip64.so";
+#endif
+
+  void *Handle = __interception::OpenLibrary(HipLibName);
+  if (!Handle)
+    return;
+
+  pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol(
+      Handle, "hipGetSymbolAddress");
+  pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(Handle, "hipMemcpy");
+  pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol(
+      Handle, "hipModuleGetGlobal");
+  pHipGetDeviceCount = (hipGetDeviceCountTy)__interception::LookupSymbol(
+      Handle, "hipGetDeviceCount");
+  pHipGetDevice =
+      (hipGetDeviceTy)__interception::LookupSymbol(Handle, "hipGetDevice");
+  pHipSetDevice =
+      (hipSetDeviceTy)__interception::LookupSymbol(Handle, "hipSetDevice");
+  pHipGetDeviceProperties =
+      (hipGetDevicePropertiesTy)__interception::LookupSymbol(
+          Handle, "hipGetDevicePropertiesR0600");
+  if (!pHipGetDeviceProperties)
+    pHipGetDeviceProperties =
+        (hipGetDevicePropertiesTy)__interception::LookupSymbol(
+            Handle, "hipGetDeviceProperties");
+
+  if (pHipGetDeviceCount && pHipGetDeviceProperties) {
+    int Count = 0;
+    if (pHipGetDeviceCount(&Count) == 0 && Count > 0) {
+      DeviceArchNames = (char (*)[256])calloc(Count, sizeof(*DeviceArchNames));
+      if (!DeviceArchNames) {
+        PROF_ERR("%s\n", "failed to allocate device arch name table");
+        return;
+      }
+      HipDevicePropMinimal Prop;
+      for (int i = 0; i < Count; ++i) {
+        __builtin_memset(&Prop, 0, sizeof(Prop));
+        if (pHipGetDeviceProperties(&Prop, i) == 0) {
+          strncpy(DeviceArchNames[i], Prop.gcnArchName,
+                  sizeof(DeviceArchNames[i]) - 1);
+          DeviceArchNames[i][sizeof(DeviceArchNames[i]) - 1] = '\0';
+          if (isVerboseMode())
+            PROF_NOTE("Device %d arch: %s\n", i, DeviceArchNames[i]);
+        }
+      }
+      NumDevices = Count;
+    }
+  }
+}
+
+#ifdef _WIN32
+static BOOL CALLBACK ensureHipLoadedCb(PINIT_ONCE, PVOID, PVOID *) {
+  doEnsureHipLoaded();
+  return TRUE;
+}
+#endif
+
+static void ensureHipLoaded(void) {
+#ifdef _WIN32
+  InitOnceExecuteOnce(&HipLoadedOnce, ensureHipLoadedCb, NULL, NULL);
+#else
+  pthread_once(&HipLoadedOnce, doEnsureHipLoaded);
+#endif
+}
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Public wrappers that forward to the loaded HIP symbols                   */
+/* -------------------------------------------------------------------------- 
*/
+
+static int hipGetSymbolAddress(void **devPtr, const void *symbol) {
+  ensureHipLoaded();
+  return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1;
+}
+
+static int hipMemcpy(void *dest, const void *src, size_t len,
+                     int kind /*2=DToH*/) {
+  ensureHipLoaded();
+  return pHipMemcpy ? pHipMemcpy(dest, src, len, kind) : -1;
+}
+
+/* Device section symbols must be registered with CLR first; otherwise
+ * hipMemcpy may take a CPU path and crash. */
+static int memcpyDeviceToHost(void *Dst, const void *Src, size_t Size) {
+  return hipMemcpy(Dst, Src, Size, 2 /* DToH */);
+}
+
+static int hipModuleGetGlobal(void **DevPtr, size_t *Bytes, void *Module,
+                              const char *Name) {
+  ensureHipLoaded();
+  return pHipModuleGetGlobal ? pHipModuleGetGlobal(DevPtr, Bytes, Module, Name)
+                             : -1;
+}
+
+static int hipGetDevice(int *DeviceId) {
+  ensureHipLoaded();
+  return pHipGetDevice ? pHipGetDevice(DeviceId) : -1;
+}
+
+static int hipSetDevice(int DeviceId) {
+  ensureHipLoaded();
+  return pHipSetDevice ? pHipSetDevice(DeviceId) : -1;
+}
+
+static const char *getDeviceArchName(int DeviceId) {
+  if (DeviceId < 0 || DeviceId >= NumDevices || !DeviceArchNames[DeviceId][0])
+    return "amdgpu";
+  return DeviceArchNames[DeviceId];
+}
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Dynamic module tracking                                                   
*/
+/* -------------------------------------------------------------------------- 
*/
+
+/* Per-TU profile entry inside a dynamic module.
+ * A single dynamic module may contain multiple TUs (e.g. -fgpu-rdc). */
+typedef struct {
+  void *DeviceVar; /* device address of __llvm_profile_sections_<CUID> */
+  int Processed;   /* 0 = not yet collected, 1 = data already copied   */
+} OffloadDynamicTUInfo;
+
+/* One entry per hipModuleLoad call. */
+typedef struct {
+  void *ModulePtr;           /* hipModule_t handle                        */
+  OffloadDynamicTUInfo *TUs; /* array of per-TU entries                 */
+  int NumTUs;
+  int CapTUs;
+} OffloadDynamicModuleInfo;
+
+static OffloadDynamicModuleInfo *DynamicModules = nullptr;
+static int NumDynamicModules = 0;
+static int CapDynamicModules = 0;
+
+/* -------------------------------------------------------------------------- 
*/
+/*  ELF symbol enumeration (manual parse: compiler-rt cannot link LLVM Support)
+ */
+/* -------------------------------------------------------------------------- 
*/
+
+#if __has_include(<elf.h>)
+#include <elf.h>
+
+/* Callback invoked for every matching symbol name found in the ELF image.
+ * Return 0 to continue iteration, non-zero to stop. */
+typedef int (*SymbolCallback)(const char *Name, void *UserData);
+
+/* If Image is a clang offload bundle, return a pointer to the first embedded
+ * ELF. Returns Image if not a bundle, nullptr if a bundle holds no ELF. */
+static const void *unwrapOffloadBundle(const void *Image) {
+  static const char BundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__";
+  if (memcmp(Image, BundleMagic, sizeof(BundleMagic) - 1) != 0)
+    return Image; /* Not a bundle, return as-is. */
+
+  const char *Buf = (const char *)Image;
+  uint64_t NumEntries;
+  __builtin_memcpy(&NumEntries, Buf + sizeof(BundleMagic) - 1,
+                   sizeof(uint64_t));
+
+  /* Walk the entry table (starts at offset 32). */
+  const char *Cursor = Buf + 32;
+  for (uint64_t I = 0; I < NumEntries; ++I) {
+    uint64_t EntryOffset, EntrySize, IDSize;
+    __builtin_memcpy(&EntryOffset, Cursor, sizeof(EntryOffset));
+    Cursor += sizeof(EntryOffset);
+    __builtin_memcpy(&EntrySize, Cursor, sizeof(EntrySize));
+    Cursor += sizeof(EntrySize);
+    __builtin_memcpy(&IDSize, Cursor, sizeof(IDSize));
+    Cursor += sizeof(IDSize);
+    Cursor += IDSize; /* skip entry ID */
+
+    if (EntrySize >= sizeof(Elf64_Ehdr)) {
+      const Elf64_Ehdr *E = (const Elf64_Ehdr *)(Buf + EntryOffset);
+      if (E->e_ident[EI_MAG0] == ELFMAG0 && E->e_ident[EI_MAG1] == ELFMAG1 &&
+          E->e_ident[EI_MAG2] == ELFMAG2 && E->e_ident[EI_MAG3] == ELFMAG3) {
+        return (const void *)(Buf + EntryOffset);
+      }
+    }
+  }
+
+  PROF_WARN("%s", "offload bundle contains no valid ELF entries\n");
+  return nullptr;
+}
+
+/* Invoke CB for every global symbol in Image (an AMDGPU ELF or offload bundle)
+ * whose name starts with PREFIX. Image may be null. */
+static void enumerateElfSymbols(const void *Image, const char *Prefix,
+                                SymbolCallback CB, void *UserData) {
+  if (!Image)
+    return;
+
+  Image = unwrapOffloadBundle(Image);
+  if (!Image)
+    return;
+
+  const Elf64_Ehdr *Ehdr = (const Elf64_Ehdr *)Image;
+  if (Ehdr->e_ident[EI_MAG0] != ELFMAG0 || Ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+      Ehdr->e_ident[EI_MAG2] != ELFMAG2 || Ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+    if (isVerboseMode())
+      PROF_NOTE("%s", "Image is not a valid ELF, skipping enumeration\n");
+    return;
+  }
+
+  size_t PrefixLen = strlen(Prefix);
+  const char *Base = (const char *)Image;
+  const Elf64_Shdr *Shdrs = (const Elf64_Shdr *)(Base + Ehdr->e_shoff);
+
+  for (int i = 0; i < Ehdr->e_shnum; ++i) {
+    if (Shdrs[i].sh_type != SHT_SYMTAB)
+      continue;
+
+    const Elf64_Sym *Syms = (const Elf64_Sym *)(Base + Shdrs[i].sh_offset);
+    int NumSyms = Shdrs[i].sh_size / sizeof(Elf64_Sym);
+    /* String table is the section referenced by sh_link. */
+    const char *StrTab = Base + Shdrs[Shdrs[i].sh_link].sh_offset;
+
+    for (int j = 0; j < NumSyms; ++j) {
+      if (Syms[j].st_name == 0)
+        continue;
+      const char *Name = StrTab + Syms[j].st_name;
+      if (strncmp(Name, Prefix, PrefixLen) == 0) {
+        if (CB(Name, UserData))
+          return;
+      }
+    }
+  }
+}
+
+/* State passed through the enumeration callback. */
+typedef struct {
+  void *Module; /* hipModule_t */
+  OffloadDynamicModuleInfo *ModInfo;
+} EnumState;
+
+/* Register one __llvm_profile_sections_<CUID> symbol on the module entry.
+ * hipModuleGetGlobal also registers the device address with CLR so hipMemcpy
+ * can copy from it later. */
+static int registerPrfSymbol(const char *Name, void *UserData) {
+  EnumState *S = (EnumState *)UserData;
+  OffloadDynamicModuleInfo *MI = S->ModInfo;
+
+  /* The symbol is the per-TU sections struct itself, not a pointer
+   * indirection, so this address is the hipMemcpy source. */
+  void *DeviceVar = nullptr;
+  size_t Bytes = 0;
+  if (hipModuleGetGlobal(&DeviceVar, &Bytes, S->Module, Name) != 0) {
+    PROF_WARN("failed to get symbol %s for module %p\n", Name, S->Module);
+    return 0; /* continue */
+  }
+
+  if (MI->NumTUs >= MI->CapTUs) {
+    int NewCap = MI->CapTUs ? MI->CapTUs * 2 : 4;
+    OffloadDynamicTUInfo *New = (OffloadDynamicTUInfo *)realloc(
+        MI->TUs, NewCap * sizeof(OffloadDynamicTUInfo));
+    if (!New) {
+      PROF_ERR("%s\n", "failed to grow TU array");
+      return 0;
+    }
+    MI->TUs = New;
+    MI->CapTUs = NewCap;
+  }
+  OffloadDynamicTUInfo *TU = &MI->TUs[MI->NumTUs++];
+  TU->DeviceVar = DeviceVar;
+  TU->Processed = 0;
+
+  (void)Name;
+  return 0; /* continue enumeration */
+}
+
+#endif /* __has_include(<elf.h>) */
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Registration / un-registration helpers                                   */
+/* -------------------------------------------------------------------------- 
*/
+
+extern "C" void
+__llvm_profile_offload_register_dynamic_module(int ModuleLoadRc, void **Ptr,
+                                               const void *Image) {
+  if (ModuleLoadRc)
+    return;
+
+  lockDynamicModules();
+
+  if (isVerboseMode())
+    PROF_NOTE("Registering loaded module %d: rc=%d, module=%p, image=%p\n",
+              NumDynamicModules, ModuleLoadRc, *Ptr, Image);
+
+  if (NumDynamicModules >= CapDynamicModules) {
+    int NewCap = CapDynamicModules ? CapDynamicModules * 2 : 64;
+    OffloadDynamicModuleInfo *New = (OffloadDynamicModuleInfo *)realloc(
+        DynamicModules, NewCap * sizeof(OffloadDynamicModuleInfo));
+    if (!New) {
+      unlockDynamicModules();
+      return;
+    }
+    DynamicModules = New;
+    CapDynamicModules = NewCap;
+  }
+
+  OffloadDynamicModuleInfo *MI = &DynamicModules[NumDynamicModules++];
+  MI->ModulePtr = *Ptr;
+  MI->TUs = nullptr;
+  MI->NumTUs = 0;
+  MI->CapTUs = 0;
+
+  /* Dynamic-module profiling needs ELF parsing for symbol enumeration. */
+#if __has_include(<elf.h>)
+  EnumState State = {*Ptr, MI};
+  enumerateElfSymbols(Image, "__llvm_profile_sections_", registerPrfSymbol,
+                      &State);
+#else
+  (void)Image;
+  if (isVerboseMode())
+    PROF_NOTE("%s",
+              "Dynamic module profiling not supported on this platform\n");
+#endif
+
+  if (MI->NumTUs == 0) {
+    PROF_WARN("no __llvm_profile_sections_* symbols found in module %p\n",
+              *Ptr);
+  } else if (isVerboseMode()) {
+    PROF_NOTE("Module %p: registered %d TU(s)\n", *Ptr, MI->NumTUs);
+  }
+
+  unlockDynamicModules();
+}
+
+extern "C" void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
+  lockDynamicModules();
+  for (int i = 0; i < NumDynamicModules; ++i) {
+    OffloadDynamicModuleInfo *MI = &DynamicModules[i];
+
+    /* HIP recycles hipModule_t addresses; drained slots are cleared so a
+     * recycled handle finds the new slot, not the dead one. */
+    if (MI->ModulePtr != Ptr)
+      continue;
+
+    if (isVerboseMode())
+      PROF_NOTE("Unregistering module %p (%d TUs)\n", MI->ModulePtr,
+                MI->NumTUs);
+
+    static int NextTUIndex = 0;
+    for (int t = 0; t < MI->NumTUs; ++t) {
+      OffloadDynamicTUInfo *TU = &MI->TUs[t];
+      if (TU->Processed) {
+        if (isVerboseMode())
+          PROF_NOTE("Module %p TU %d already processed, skipping\n", Ptr, t);
+        continue;
+      }
+      int TUIndex = __atomic_fetch_add(&NextTUIndex, 1, __ATOMIC_RELAXED);
+      if (TU->DeviceVar) {
+        int CurDev = 0;
+        hipGetDevice(&CurDev);
+        const char *ArchName = getDeviceArchName(CurDev);
+        /* Encode TUIndex in Target so each drain writes a distinct profraw;
+         * otherwise back-to-back drains overwrite the same file. */
+        char TargetWithTU[64];
+        snprintf(TargetWithTU, sizeof(TargetWithTU), "%s.%d", ArchName,
+                 TUIndex);
+        if (processDeviceOffloadPrf(TU->DeviceVar, TUIndex, TargetWithTU) == 0)
+          TU->Processed = 1;
+        else
+          PROF_WARN("failed to process profile data for module %p TU %d\n", 
Ptr,
+                    t);
+      }
+    }
+    MI->ModulePtr = nullptr;
+    unlockDynamicModules();
+    return;
+  }
+
+  if (isVerboseMode())
+    PROF_WARN("unregister called for unknown module %p\n", Ptr);
+  unlockDynamicModules();
+}
+
+/* Grow a void* array, doubling capacity (or starting at InitCap). */
+static int growPtrArray(void ***Arr, int *Num, int *Cap, int InitCap) {
+  if (*Num < *Cap)
+    return 0;
+  int NewCap = *Cap ? *Cap * 2 : InitCap;
+  void **New = (void **)realloc(*Arr, NewCap * sizeof(void *));
+  if (!New)
+    return -1;
+  *Arr = New;
+  *Cap = NewCap;
+  return 0;
+}
+
+static void **OffloadShadowVariables = nullptr;
+static int NumShadowVariables = 0;
+static int CapShadowVariables = 0;
+
+extern "C" void __llvm_profile_offload_register_shadow_variable(void *ptr) {
+  if (growPtrArray(&OffloadShadowVariables, &NumShadowVariables,
+                   &CapShadowVariables, 64))
+    return;
+  OffloadShadowVariables[NumShadowVariables++] = ptr;
+}
+
+static void **OffloadSectionShadowVariables = nullptr;
+static int NumSectionShadowVariables = 0;
+static int CapSectionShadowVariables = 0;
+
+extern "C" void
+__llvm_profile_offload_register_section_shadow_variable(void *ptr) {
+  if (growPtrArray(&OffloadSectionShadowVariables, &NumSectionShadowVariables,
+                   &CapSectionShadowVariables, 64))
+    return;
+  OffloadSectionShadowVariables[NumSectionShadowVariables++] = ptr;
+}
+
+namespace {
+
+// free()-based scope guard. Use .release() to transfer ownership.
+struct UniqueFree {
+  void *Ptr;
+  explicit UniqueFree(void *P = nullptr) : Ptr(P) {}
+  ~UniqueFree() { free(Ptr); }
+  UniqueFree(const UniqueFree &) = delete;
+  UniqueFree &operator=(const UniqueFree &) = delete;
+  char *get() const { return static_cast<char *>(Ptr); }
+  void reset(void *P) {
+    free(Ptr);
+    Ptr = P;
+  }
+  void *release() {
+    void *P = Ptr;
+    Ptr = nullptr;
+    return P;
+  }
+};
+
+} // namespace
+
+static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
+                                   const char *Target) {
+  __llvm_profile_gpu_sections HostSections;
+
+  if (hipMemcpy(&HostSections, DeviceOffloadPrf, sizeof(HostSections),
+                2 /*DToH*/) != 0) {
+    PROF_ERR("%s\n", "failed to copy offload prf structure from device");
+    return -1;
+  }
+
+  const void *DevCntsBegin = HostSections.CountersStart;
+  const void *DevDataBegin = HostSections.DataStart;
+  const void *DevNamesBegin = HostSections.NamesStart;
+  const void *DevCntsEnd = HostSections.CountersStop;
+  const void *DevDataEnd = HostSections.DataStop;
+  const void *DevNamesEnd = HostSections.NamesStop;
+
+  size_t CountersSize = (const char *)DevCntsEnd - (const char *)DevCntsBegin;
+  size_t DataSize = (const char *)DevDataEnd - (const char *)DevDataBegin;
+  size_t NamesSize = (const char *)DevNamesEnd - (const char *)DevNamesBegin;
+
+  if (isVerboseMode())
+    PROF_NOTE("Section pointers: Cnts=[%p,%p]=%zu Data=[%p,%p]=%zu "
+              "Names=[%p,%p]=%zu\n",
+              DevCntsBegin, DevCntsEnd, CountersSize, DevDataBegin, DevDataEnd,
+              DataSize, DevNamesBegin, DevNamesEnd, NamesSize);
+
+  if (CountersSize == 0 || DataSize == 0)
+    return 0;
+
+  int ret = -1;
+  int NamesReused = 0, CntsReused = 0, DataReused = 0;
+
+  char *HostDataBegin = nullptr;
+  char *HostCountersBegin = nullptr;
+  char *HostNamesBegin = nullptr;
+
+  /* Sections using linker-defined __start_/__stop_ bounds are shared across
+     TU structs in RDC mode. Deduplicate by caching the last copied range. */
+  static const void *CachedDevNamesBegin = nullptr;
+  static char *CachedHostNames = nullptr;
+  static size_t CachedNamesSize = 0;
+
+  static const void *CachedDevCntsBegin = nullptr;
+  static char *CachedHostCnts = nullptr;
+  static size_t CachedCntsSize = 0;
+
+  static const void *CachedDevDataBegin = nullptr;
+  static char *CachedHostData = nullptr;
+  static size_t CachedDataSize = 0;
+
+  // Owns freshly malloc'd buffers; release() transfers ownership to the cache.
+  UniqueFree CntsOwner, DataOwner, NamesOwner;
+
+  if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin &&
+      CountersSize == CachedCntsSize) {
+    HostCountersBegin = CachedHostCnts;
+    CntsReused = 1;
+    if (isVerboseMode())
+      PROF_NOTE("Reusing cached counters section (%zu bytes)\n", CountersSize);
+  } else if (CountersSize > 0) {
+    HostCountersBegin = (char *)malloc(CountersSize);
+    CntsOwner.reset(HostCountersBegin);
+  }
+
+  if (DataSize > 0 && DevDataBegin == CachedDevDataBegin &&
+      DataSize == CachedDataSize) {
+    HostDataBegin = CachedHostData;
+    DataReused = 1;
+    if (isVerboseMode())
+      PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize);
+  } else if (DataSize > 0) {
+    HostDataBegin = (char *)malloc(DataSize);
+    DataOwner.reset(HostDataBegin);
+  }
+
+  if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin &&
+      NamesSize == CachedNamesSize) {
+    HostNamesBegin = CachedHostNames;
+    NamesReused = 1;
+    if (isVerboseMode())
+      PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize);
+  } else if (NamesSize > 0) {
+    HostNamesBegin = (char *)malloc(NamesSize);
+    NamesOwner.reset(HostNamesBegin);
+  }
+
+  if ((DataSize > 0 && !HostDataBegin) ||
+      (CountersSize > 0 && !HostCountersBegin) ||
+      (NamesSize > 0 && !HostNamesBegin)) {
+    PROF_ERR("%s\n", "failed to allocate host memory for device sections");
+    return -1;
+  }
+
+  if ((DataSize > 0 && !DataReused &&
+       memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) ||
+      (CountersSize > 0 && !CntsReused &&
+       memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) !=
+           0) ||
+      (NamesSize > 0 && !NamesReused &&
+       memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) {
+    PROF_ERR("%s\n", "failed to copy profile sections from device");
+    return -1;
+  }
+
+  /* Cache buffers so RDC-mode multi-shadow drains can reuse them.
+   * release() prevents the scope guards from freeing what the cache owns. */
+  if (!CntsReused && CountersSize > 0) {
+    CachedDevCntsBegin = DevCntsBegin;
+    CachedHostCnts = HostCountersBegin;
+    CachedCntsSize = CountersSize;
+    CntsOwner.release();
+  }
+  if (!DataReused && DataSize > 0) {
+    CachedDevDataBegin = DevDataBegin;
+    CachedHostData = HostDataBegin;
+    CachedDataSize = DataSize;
+    DataOwner.release();
+  }
+  if (!NamesReused && NamesSize > 0) {
+    CachedDevNamesBegin = DevNamesBegin;
+    CachedHostNames = HostNamesBegin;
+    CachedNamesSize = NamesSize;
+    NamesOwner.release();
+  }
+
+  if (isVerboseMode())
+    PROF_NOTE("Copied device sections: Counters=%zu, Data=%zu, Names=%zu\n",
+              CountersSize, DataSize, NamesSize);
+
+  // Arrange buffer as [Data][Padding][Counters][Names] to match the layout
+  // expected by lprofWriteDataImpl (CountersDelta = CountersBegin - 
DataBegin).
+  const uint64_t NumData = DataSize / sizeof(__llvm_profile_data);
+  const uint64_t NumBitmapBytes = 0;
+  const uint64_t VTableSectionSize = 0;
+  const uint64_t VNamesSize = 0;
+  uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
+      PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames,
+      PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+
+  if (__llvm_profile_get_padding_sizes_for_counters(
+          DataSize, CountersSize, NumBitmapBytes, NamesSize, VTableSectionSize,
+          VNamesSize, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
+          &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames,
+          &PaddingBytesAfterVTable, &PaddingBytesAfterVNames) != 0) {
+    PROF_ERR("%s\n", "failed to get padding sizes");
+    return -1;
+  }
+
+  size_t ContiguousBufferSize =
+      DataSize + PaddingBytesBeforeCounters + CountersSize + NamesSize;
+  UniqueFree ContiguousBuf(malloc(ContiguousBufferSize));
+  if (!ContiguousBuf.get()) {
+    PROF_ERR("%s\n", "failed to allocate contiguous buffer");
+    return -1;
+  }
+  char *ContiguousBuffer = ContiguousBuf.get();
+  __builtin_memset(ContiguousBuffer, 0, ContiguousBufferSize);
+
+  char *BufDataBegin = ContiguousBuffer;
+  char *BufCountersBegin =
+      ContiguousBuffer + DataSize + PaddingBytesBeforeCounters;
+  char *BufNamesBegin = BufCountersBegin + CountersSize;
+
+  __builtin_memcpy(BufDataBegin, HostDataBegin, DataSize);
+  __builtin_memcpy(BufCountersBegin, HostCountersBegin, CountersSize);
+  __builtin_memcpy(BufNamesBegin, HostNamesBegin, NamesSize);
+
+  // CounterPtr is a device-relative offset; relocate it for the file layout
+  // where the Data section precedes Counters.
+  __llvm_profile_data *RelocatedData = (__llvm_profile_data *)BufDataBegin;
+  for (uint64_t i = 0; i < NumData; ++i) {
+    if (RelocatedData[i].CounterPtr) {
+      ptrdiff_t DeviceCounterPtrOffset = 
(ptrdiff_t)RelocatedData[i].CounterPtr;
+      const char *DeviceDataStructAddr =
+          (const char *)DevDataBegin + (i * sizeof(__llvm_profile_data));
+      const char *DeviceCountersAddr =
+          DeviceDataStructAddr + DeviceCounterPtrOffset;
+      ptrdiff_t OffsetIntoCountersSection =
+          DeviceCountersAddr - (const char *)DevCntsBegin;
+
+      ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters +
+                                    OffsetIntoCountersSection -
+                                    (i * sizeof(__llvm_profile_data));
+      __builtin_memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) 
+
+                           offsetof(__llvm_profile_data, CounterPtr),
+                       &NewRelativeOffset, sizeof(NewRelativeOffset));
+    }
+    __builtin_memset((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
+                         offsetof(__llvm_profile_data, BitmapPtr),
+                     0,
+                     sizeof(RelocatedData[i].BitmapPtr) +
+                         sizeof(RelocatedData[i].FunctionPointer) +
+                         sizeof(RelocatedData[i].Values));
+  }
+
+  /* Target already encodes TUIndex when needed. */
+  (void)TUIndex;
+
+  ret = __llvm_write_custom_profile(
+      Target, (__llvm_profile_data *)BufDataBegin,
+      (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin,
+      BufCountersBegin + CountersSize, BufNamesBegin, BufNamesBegin + 
NamesSize,
+      nullptr);
+
+  if (ret != 0) {
+    PROF_ERR("%s\n", "failed to write device profile using shared API");
+  } else if (isVerboseMode()) {
+    PROF_NOTE("%s\n", "Successfully wrote device profile using shared API");
+  }
+
+  return ret;
+}
+
+static int processShadowVariable(void *ShadowVar, int TUIndex,
+                                 const char *Target) {
+  void *DeviceSections = nullptr;
+  if (hipGetSymbolAddress(&DeviceSections, ShadowVar) != 0) {
+    PROF_WARN("failed to get symbol address for shadow variable %p\n",
+              ShadowVar);
+    return -1;
+  }
+  /* DeviceSections points at the per-TU sections struct itself. */
+  return processDeviceOffloadPrf(DeviceSections, TUIndex, Target);
+}
+
+static int isHipAvailable(void) {
+  ensureHipLoaded();
+  return pHipMemcpy != nullptr && pHipGetSymbolAddress != nullptr;
+}
+
+/* -------------------------------------------------------------------------- 
*/
+/*  Collect device-side profile data                                          
*/
+/* -------------------------------------------------------------------------- 
*/
+
+extern "C" int __llvm_profile_hip_collect_device_data(void) {
+  if (NumShadowVariables == 0 && NumDynamicModules == 0)
+    return 0;
+
+  if (!isHipAvailable())
+    return 0;
+
+  int Ret = 0;
+
+  /* Shadow variables (static-linked kernels): drain from every device. */
+  if (NumShadowVariables > 0) {
+    int OrigDevice = -1;
+    hipGetDevice(&OrigDevice);
+
+    for (int Dev = 0; Dev < NumDevices; ++Dev) {
+      if (hipSetDevice(Dev) != 0) {
+        if (isVerboseMode())
+          PROF_NOTE("Failed to set device %d, skipping\n", Dev);
+        continue;
+      }
+      const char *ArchName = getDeviceArchName(Dev);
+      if (isVerboseMode())
+        PROF_NOTE("Collecting static profile data from device %d (%s)\n", Dev,
+                  ArchName);
+      for (int i = 0; i < NumShadowVariables; ++i) {
+        /* RDC-mode multi-shadow drains need a distinct profraw per TU;
+         * single-TU programs keep the bare arch target. */
+        const char *Target = ArchName;
+        char TargetWithIdx[64];
+        if (NumShadowVariables > 1) {
+          snprintf(TargetWithIdx, sizeof(TargetWithIdx), "%s.%d", ArchName, i);
+          Target = TargetWithIdx;
+        }
+        if (processShadowVariable(OffloadShadowVariables[i], i, Target) != 0)
+          Ret = -1;
+      }
+    }
+
+    if (OrigDevice >= 0)
+      hipSetDevice(OrigDevice);
+  }
+
+  /* Warn about unprocessed TUs; skip cleared slots (already drained). */
+  lockDynamicModules();
+  for (int i = 0; i < NumDynamicModules; ++i) {
+    OffloadDynamicModuleInfo *MI = &DynamicModules[i];
+    if (!MI->ModulePtr)
+      continue;
+    for (int t = 0; t < MI->NumTUs; ++t) {
+      if (!MI->TUs[t].Processed) {
+        PROF_WARN("dynamic module %p TU %d was not processed before exit\n",
+                  MI->ModulePtr, t);
+        Ret = -1;
+      }
+    }
+  }
+  unlockDynamicModules();
+
+  if (Ret != 0)
+    PROF_WARN("%s\n", "failed to collect device profile data");
+  return Ret;
+}
+
+/* Interceptors for hipModuleLoad* / hipModuleUnload. Linux only. */
+
+#if defined(__linux__) && !defined(_WIN32)
+
+INTERCEPTOR(int, hipModuleLoad, void **module, const char *fname) {
+  int rc = REAL(hipModuleLoad)(module, fname);
+  /* Pass NULL image: no in-memory ELF is available for filename loads,
+   * so the register hook skips symbol enumeration. */
+  __llvm_profile_offload_register_dynamic_module(rc, module, nullptr);
+  return rc;
+}
+
+INTERCEPTOR(int, hipModuleLoadData, void **module, const void *image) {
+  int rc = REAL(hipModuleLoadData)(module, image);
+  __llvm_profile_offload_register_dynamic_module(rc, module, image);
+  return rc;
+}
+
+INTERCEPTOR(int, hipModuleLoadDataEx, void **module, const void *image,
+            unsigned numOptions, void **options, void **optionValues) {
+  int rc = REAL(hipModuleLoadDataEx)(module, image, numOptions, options,
+                                     optionValues);
+  __llvm_profile_offload_register_dynamic_module(rc, module, image);
+  return rc;
+}
+
+INTERCEPTOR(int, hipModuleUnload, void *module) {
+  /* Drain counters before the module is destroyed; device addresses
+   * captured at register time are invalid after unload. */
+  __llvm_profile_offload_unregister_dynamic_module(module);
+  return REAL(hipModuleUnload)(module);
+}
+
+__attribute__((constructor)) static void installHipModuleInterceptors() {
+  /* Skip when the HIP runtime is not loaded. INTERCEPT_FUNCTION uses the
+   * sanitizer interception framework, which can perturb dlsym/PLT state for
+   * the rest of the process even when the target symbol is absent; non-HIP
+   * programs linked with libclang_rt.profile.a must see zero side effects. */
+  if (!dlsym(RTLD_DEFAULT, "hipModuleLoad"))
+    return;
+  if (!INTERCEPT_FUNCTION(hipModuleLoad))
+    return;
+  if (isVerboseMode())
+    PROF_NOTE("%s", "Installing hipModuleLoad*/hipModuleUnload 
interceptors\n");
+  INTERCEPT_FUNCTION(hipModuleLoadData);
+  INTERCEPT_FUNCTION(hipModuleLoadDataEx);
+  INTERCEPT_FUNCTION(hipModuleUnload);
+}
+
+#endif /* __linux__ */
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td 
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index cbdc48a9a717f..37bad559f49e7 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -500,6 +500,9 @@ def DEOPTIMIZE : RuntimeLibcall;
 // Return address
 def RETURN_ADDRESS : RuntimeLibcall;
 
+// GPU profiling
+def PROFILE_INSTRUMENT_GPU : RuntimeLibcall;
+
 // Clear cache
 def CLEAR_CACHE : RuntimeLibcall;
 def RISCV_FLUSH_ICACHE : RuntimeLibcall;
@@ -2241,8 +2244,10 @@ def WindowsARM64ECSystemLibrary
 
 def isAMDGPU : RuntimeLibcallPredicate<"TT.isAMDGPU()">;
 
-// No calls.
-def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add)>;
+def __llvm_profile_instrument_gpu : RuntimeLibcallImpl<PROFILE_INSTRUMENT_GPU>;
+
+def AMDGPUSystemLibrary
+    : SystemRuntimeLibrary<isAMDGPU, (add __llvm_profile_instrument_gpu)>;
 
 
//===----------------------------------------------------------------------===//
 // ARM Runtime Libcalls
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h 
b/llvm/include/llvm/ProfileData/InstrProf.h
index dffc58281c2d9..b7f917ec39b70 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -122,6 +122,11 @@ inline StringRef getInstrProfValueProfMemOpFuncName() {
 /// Return the prefix of the name of the variables to function as a filter.
 inline StringRef getInstrProfVarPrefix() { return "__prof"; }
 
+/// Return the name of the GPU wave-cooperative counter increment helper.
+inline StringRef getInstrProfInstrumentGPUFuncName() {
+  return INSTR_PROF_INSTRUMENT_GPU_FUNC_STR;
+}
+
 /// Return the name prefix of variables containing instrumented function names.
 inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; }
 
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp 
b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index dabd495cddd49..8e4ba41919768 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Frontend/Offloading/Utility.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -33,14 +34,17 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
 #include "llvm/ProfileData/InstrProf.h"
@@ -287,6 +291,8 @@ class InstrLowerer final {
   GlobalVariable *NamesVar = nullptr;
   size_t NamesSize = 0;
 
+  StructType *ProfileDataTy = nullptr;
+
   // vector of counter load/store pairs to be register promoted.
   std::vector<LoadStorePair> PromotionCandidates;
 
@@ -407,6 +413,9 @@ class InstrLowerer final {
   /// Create a static initializer for our data, on platforms that need it,
   /// and for any profile output file that was specified.
   void emitInitialization();
+
+  /// Return the __llvm_profile_data struct type.
+  StructType *getProfileDataTy();
 };
 
 ///
@@ -1190,19 +1199,22 @@ void InstrLowerer::lowerTimestamp(
 
 void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   auto *Addr = getCounterAddress(Inc);
-
   IRBuilder<> Builder(Inc);
   if (isGPUProfTarget(M)) {
-    auto *I64Ty = Builder.getInt64Ty();
+    auto *Int64Ty = Builder.getInt64Ty();
     auto *PtrTy = Builder.getPtrTy();
     auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()),
-                                       {PtrTy, PtrTy, I64Ty}, false);
-    auto Callee =
-        M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy);
+                                       {PtrTy, PtrTy, Int64Ty}, false);
+    FunctionCallee Callee =
+        M.getOrInsertFunction(RTLIB::RuntimeLibcallsInfo::getLibcallImplName(
+                                  RTLIB::impl___llvm_profile_instrument_gpu),
+                              CalleeTy);
     Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy);
     Value *Uniform =
         ConstantPointerNull::get(PointerType::getUnqual(M.getContext()));
-    Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()});
+    Value *StepI64 =
+        Builder.CreateZExtOrTrunc(Inc->getStep(), Int64Ty, "step.i64");
+    Builder.CreateCall(Callee, {CastAddr, Uniform, StepI64});
   } else if (Options.Atomic || AtomicCounterUpdateAll ||
              (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
     Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
@@ -1400,6 +1412,12 @@ static inline Constant *getFuncAddrForProfData(Function 
*Fn) {
   if (shouldUsePublicSymbol(Fn))
     return Fn;
 
+  // For GPU targets, weak functions cannot use private aliases because
+  // LTO may pick a different TU's copy, leaving the alias undefined
+  if (isGPUProfTarget(*Fn->getParent()) &&
+      GlobalValue::isWeakForLinker(Fn->getLinkage()))
+    return Fn;
+
   // When possible use a private alias to avoid symbolic relocations.
   auto *GA = GlobalAlias::create(GlobalValue::LinkageTypes::PrivateLinkage,
                                  Fn->getName() + ".local", Fn);
@@ -1623,11 +1641,15 @@ GlobalVariable 
*InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
   }
 
   Ptr->setVisibility(Visibility);
-  // Put the counters and bitmaps in their own sections so linkers can
-  // remove unneeded sections.
   Ptr->setSection(getInstrProfSectionName(IPSK, TT.getObjectFormat()));
   Ptr->setLinkage(Linkage);
-  maybeSetComdat(Ptr, Fn, VarName);
+  if (isGPUProfTarget(M) && !Ptr->hasComdat()) {
+    Ptr->setComdat(M.getOrInsertComdat(VarName));
+    Ptr->setLinkage(GlobalValue::LinkOnceODRLinkage);
+    Ptr->setVisibility(GlobalValue::ProtectedVisibility);
+  } else {
+    maybeSetComdat(Ptr, Fn, VarName);
+  }
   return Ptr;
 }
 
@@ -1799,7 +1821,8 @@ void 
InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   }
 
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
-  auto *CounterPtr = PD.RegionCounters;
+
+  Constant *CounterPtr = PD.RegionCounters;
 
   uint64_t NumBitmapBytes = PD.NumBitmapBytes;
 
@@ -1807,11 +1830,7 @@ void 
InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
   auto *Int16Ty = Type::getInt16Ty(Ctx);
   auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
-  Type *DataTypes[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
-#include "llvm/ProfileData/InstrProfData.inc"
-  };
-  auto *DataTy = StructType::get(Ctx, ArrayRef(DataTypes));
+  auto *DataTy = getProfileDataTy();
 
   Constant *FunctionAddr = getFuncAddrForProfData(Fn);
 
@@ -1819,6 +1838,15 @@ void 
InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
     Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
 
+  if (isGPUProfTarget(M)) {
+    // For GPU targets, weak functions need weak linkage for their profile data
+    // aliases to allow linker deduplication across TUs
+    if (GlobalValue::isWeakForLinker(Fn->getLinkage()))
+      Linkage = Fn->getLinkage();
+    else
+      Linkage = GlobalValue::ExternalLinkage;
+    Visibility = GlobalValue::ProtectedVisibility;
+  }
   // If the data variable is not referenced by code (if we don't emit
   // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
   // data variable live under linker GC, the data variable can be private. This
@@ -1830,19 +1858,22 @@ void 
InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
   // that other copies must have the same CFG and cannot have value profiling.
   // If no hash suffix, other profd copies may be referenced by code.
-  if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
+  if (!isGPUProfTarget(M) && NS == 0 &&
+      !(DataReferencedByCode && NeedComdat && !Renamed) &&
       (TT.isOSBinFormatELF() ||
        (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
     Linkage = GlobalValue::PrivateLinkage;
     Visibility = GlobalValue::DefaultVisibility;
   }
-  // AMDGPU objects are always ET_DYN, so non-local symbols with default
-  // visibility are preemptible. The CounterPtr label difference emits a REL32
-  // relocation that lld rejects against preemptible targets.
-  if (TT.isAMDGPU() && !GlobalValue::isLocalLinkage(Linkage))
+  // GPU-target ELF objects are always ET_DYN, so non-local symbols with
+  // default visibility are preemptible. The CounterPtr label difference
+  // emits a REL32 relocation that lld rejects against preemptible targets.
+  if (TT.isGPU() && TT.isOSBinFormatELF() &&
+      !GlobalValue::isLocalLinkage(Linkage))
     Visibility = GlobalValue::ProtectedVisibility;
   auto *Data =
       new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName);
+
   Constant *RelativeCounterPtr;
   GlobalVariable *BitmapPtr = PD.RegionBitmaps;
   Constant *RelativeBitmapPtr = ConstantInt::get(IntPtrTy, 0);
@@ -1883,7 +1914,12 @@ void 
InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   Data->setSection(
       getInstrProfSectionName(DataSectionKind, TT.getObjectFormat()));
   Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
-  maybeSetComdat(Data, Fn, CntsVarName);
+  if (isGPUProfTarget(M) && !Data->hasComdat()) {
+    Data->setComdat(M.getOrInsertComdat(CntsVarName));
+    Data->setLinkage(GlobalValue::LinkOnceODRLinkage);
+  } else {
+    maybeSetComdat(Data, Fn, CntsVarName);
+  }
 
   PD.DataVar = Data;
 
@@ -1961,16 +1997,18 @@ void InstrLowerer::emitNameData() {
   auto &Ctx = M.getContext();
   auto *NamesVal =
       ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false);
-  NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
-                                GlobalValue::PrivateLinkage, NamesVal,
-                                getInstrProfNamesVarName());
+  std::string NamesVarName = std::string(getInstrProfNamesVarName());
+  NamesVar =
+      new GlobalVariable(M, NamesVal->getType(), true,
+                         GlobalValue::PrivateLinkage, NamesVal, NamesVarName);
 
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
-  NamesVar->setSection(
+  std::string NamesSectionName =
       ProfileCorrelate == InstrProfCorrelator::BINARY
           ? getInstrProfSectionName(IPSK_covname, TT.getObjectFormat())
-          : getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
+          : getInstrProfSectionName(IPSK_name, TT.getObjectFormat());
+  NamesVar->setSection(NamesSectionName);
   // On COFF, it's important to reduce the alignment down to 1 to prevent the
   // linker from inserting padding before the start of the names section or
   // between names entries.
@@ -2179,3 +2217,22 @@ void createProfileSamplingVar(Module &M) {
   appendToCompilerUsed(M, SamplingVar);
 }
 } // namespace llvm
+
+// For GPU targets: Allocate contiguous arrays for all profile data.
+// This solves the linker reordering problem by using ONE symbol per section
+// type, so there's nothing for the linker to reorder.
+StructType *InstrLowerer::getProfileDataTy() {
+  if (ProfileDataTy)
+    return ProfileDataTy;
+
+  auto &Ctx = M.getContext();
+  auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  auto *Int16Ty = Type::getInt16Ty(Ctx);
+  auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
+  Type *DataTypes[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+  };
+  ProfileDataTy = StructType::get(Ctx, ArrayRef(DataTypes));
+  return ProfileDataTy;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp 
b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index db032d6fcad45..b6d07aa821e7f 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -380,7 +380,8 @@ class FunctionInstrumenter final {
   // another counter range within the context.
   bool isValueProfilingDisabled() const {
     return DisableValueProfiling ||
-           InstrumentationType == PGOInstrumentationType::CTXPROF;
+           InstrumentationType == PGOInstrumentationType::CTXPROF ||
+           M.getTargetTriple().isGPU();
   }
 
   bool shouldInstrumentEntryBB() const {
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll 
b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll
new file mode 100644
index 0000000000000..efe53ab1ebdfb
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll
@@ -0,0 +1,32 @@
+;; Test basic AMDGPU PGO instrumentation lowering.
+;; Verifies that each instrumentation point lowers directly to a call to
+;; __llvm_profile_instrument_gpu with a null uniform-counter argument.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+@__hip_cuid_test01 = addrspace(1) global i8 0
+@__profn_test_kernel = private constant [11 x i8] c"test_kernel"
+
+define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out, i32 %n) {
+entry:
+  call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 
4, i32 0)
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 
4, i32 1)
+  store i32 1, ptr addrspace(1) %out
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+; CHECK-LABEL: define {{.*}} @test_kernel
+; CHECK-NOT: @__llvm_profile_sampling_gpu
+; CHECK: call void @__llvm_profile_instrument_gpu(
+; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) @__profc_test_kernel to 
ptr), ptr null, i64 1)
+; CHECK: call void @__llvm_profile_instrument_gpu(
+; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([4 x 
i64], ptr addrspace(1) @__profc_test_kernel, i32 0, i32 1) to ptr), ptr null, 
i64 1)
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll 
b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll
new file mode 100644
index 0000000000000..eab78fb3591b1
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll
@@ -0,0 +1,26 @@
+;; Per-kernel __profc_* arrays land in section __llvm_prf_cnts with one slot
+;; per counter, and counter increments lower to __llvm_profile_instrument_gpu
+;; calls whose pointer argument is a GEP into the per-kernel array.
+
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instrprof < %s | FileCheck %s
+
+@__profn_kernel1 = private constant [7 x i8] c"kernel1"
+@__profn_kernel2 = private constant [7 x i8] c"kernel2"
+
+; CHECK: @__profc_kernel1 = linkonce_odr protected addrspace(1) global [2 x 
i64] zeroinitializer, section "__llvm_prf_cnts"
+; CHECK: @__profc_kernel2 = linkonce_odr protected addrspace(1) global [1 x 
i64] zeroinitializer, section "__llvm_prf_cnts"
+
+define amdgpu_kernel void @kernel1() {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, 
i32 0)
+  call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, 
i32 1)
+  ret void
+}
+
+define amdgpu_kernel void @kernel2() {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel2, i64 67890, i32 1, 
i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr 
addrspace(1) getelementptr inbounds ([2 x i64], ptr addrspace(1) 
@__profc_kernel1, i32 0, i32 1) to ptr), ptr null, i64 1)
diff --git a/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll 
b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll
new file mode 100644
index 0000000000000..ce16f1ee3215f
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -passes=instrprof -S | FileCheck %s
+
+; Test that weak functions on GPU targets get weak linkage for their
+; __profd_ aliases to allow linker deduplication across TUs.
+; Non-weak functions get external linkage (default for aliases).
+
+target triple = "amdgcn-amd-amdhsa"
+
+@__hip_cuid_abc123 = addrspace(1) global i8 0
+
+; AMDGPU GPU profiling lowers to per-function comdat globals (not aliases).
+; CHECK: @__profd_weak_func = linkonce_odr protected addrspace(1) global
+@__profn_weak_func = private constant [9 x i8] c"weak_func"
+
+define weak void @weak_func() {
+  call void @llvm.instrprof.increment(ptr @__profn_weak_func, i64 0, i32 1, 
i32 0)
+  ret void
+}
+
+; CHECK: @__profd_weak_odr_func = linkonce_odr protected addrspace(1) global
+@__profn_weak_odr_func = private constant [13 x i8] c"weak_odr_func"
+
+define weak_odr void @weak_odr_func() {
+  call void @llvm.instrprof.increment(ptr @__profn_weak_odr_func, i64 0, i32 
1, i32 0)
+  ret void
+}
+
+; CHECK: @__profd_normal_func = linkonce_odr protected addrspace(1) global
+@__profn_normal_func = private constant [11 x i8] c"normal_func"
+
+define void @normal_func() {
+  call void @llvm.instrprof.increment(ptr @__profn_normal_func, i64 0, i32 1, 
i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
diff --git a/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll 
b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
new file mode 100644
index 0000000000000..21b1d68004b13
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
@@ -0,0 +1,22 @@
+;; Test that value profiling (indirect call profiling) is disabled for GPU 
targets.
+;; The device-side profiling runtime does not implement
+;; __llvm_profile_instrument_target, so indirect call profiling must not be 
emitted.
+
+; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+@fptr = addrspace(1) global ptr null, align 8
+
+;; Verify that regular block instrumentation IS emitted
+; CHECK: call void @llvm.instrprof.increment
+
+;; Verify that value profiling for indirect calls is NOT emitted
+; CHECK-NOT: call void @llvm.instrprof.value.profile
+
+define amdgpu_kernel void @test_indirect_call() {
+entry:
+  %fp = load ptr, ptr addrspace(1) @fptr, align 8
+  call void %fp()
+  ret void
+}

>From 10cbf51f82f91f116542de39216d0d509d8cc9bd Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <[email protected]>
Date: Wed, 3 Jun 2026 15:33:59 -0400
Subject: [PATCH 2/2] Reland "[PGO][AMDGPU] Add basic HIP offload PGO support
 (#177665)"

This relands commit 5db13643f4b7 and its merged follow-ups, which were all
reverted together in 6cfa1a01a0f7 (#201416):

  635e120fb873 [PGO][HIP] Stop pulling ROCm.o into every PGO host link (#200101)
  2766733764f4 [compiler-rt][profile] Add COMPILER_RT_BUILD_PROFILE_ROCM option 
(#200127)
  4c33844b4b56 [PGO][HIP] Skip ROCm interceptor in profile-only compiler-rt 
builds (#200111)
  5eca8b67ff3c [PGO][HIP] Fix profile-only Windows link by gating ROCm 
interceptor macro (#200859)

The revert was done because the merged change still broke Windows profiling
builds with the static CRT (the interceptor merge forced the profile library
to /MD). That last regression had no merged fix yet; it is addressed by the
follow-up commit in this reland, so the series is green on Windows from the
start.

This reverts commit 6cfa1a01a0f737ed1d54963810057fb1bd67a274.
---
 compiler-rt/CMakeLists.txt              |  7 ++++++-
 compiler-rt/lib/profile/CMakeLists.txt  | 11 +++++++----
 compiler-rt/test/profile/CMakeLists.txt |  8 ++++++--
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 39034fd9ba67d..2693b35cad5c7 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -323,7 +323,12 @@ option(COMPILER_RT_USE_ATOMIC_LIBRARY "Use compiler-rt 
atomic instead of libatom
 option(COMPILER_RT_PROFILE_BAREMETAL "Build minimal baremetal profile library" 
OFF)
 
 set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM ON)
-if(APPLE)
+if(APPLE OR WIN32)
+  # On Windows the HIP host interceptor (InstrProfilingPlatformROCm.cpp) pulls 
in
+  # RTInterception + sanitizer_common, which are built /MD and force the whole
+  # profile library to /MD, breaking the default static-CRT (/MT) profile 
builds.
+  # Device PGO on Windows is not wired up yet, so default this off; users who 
need
+  # it can opt in explicitly and accept the /MD profile library.
   set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM OFF)
 endif()
 option(COMPILER_RT_BUILD_PROFILE_ROCM
diff --git a/compiler-rt/lib/profile/CMakeLists.txt 
b/compiler-rt/lib/profile/CMakeLists.txt
index 77db2477bb7c6..02c94cd2c4db2 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -161,12 +161,15 @@ endif()
 # The HIP host interceptor in InstrProfilingPlatformROCm.cpp pulls in
 # RTInterception + sanitizer_common object libs. Those targets are only created
 # when COMPILER_RT_BUILD_SANITIZERS / _MEMPROF / _XRAY / _CTX_PROFILE is 
enabled
-# (see lib/CMakeLists.txt). In a profile-only build the targets do not exist;
-# skip both the object-lib merge and the ROCm source file so the static archive
-# remains self-contained.
+# (see lib/CMakeLists.txt). Only merge them (and keep the ROCm source) when 
ROCm
+# device PGO is requested and the targets exist; otherwise skip both so the
+# static archive stays self-contained. This also keeps the profile library on
+# the static CRT (/MT) on Windows by default, since the merged sanitizer object
+# libs are built /MD.
 set(PROFILE_OBJECT_LIBS)
 set(PROFILE_HAS_HIP_INTERCEPTOR FALSE)
-if(COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL
+if(COMPILER_RT_BUILD_PROFILE_ROCM
+   AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL
    AND TARGET RTInterception.${COMPILER_RT_DEFAULT_TARGET_ARCH}
    AND TARGET RTSanitizerCommon.${COMPILER_RT_DEFAULT_TARGET_ARCH}
    AND TARGET RTSanitizerCommonLibc.${COMPILER_RT_DEFAULT_TARGET_ARCH})
diff --git a/compiler-rt/test/profile/CMakeLists.txt 
b/compiler-rt/test/profile/CMakeLists.txt
index 213a05032ed80..7e79c054328a8 100644
--- a/compiler-rt/test/profile/CMakeLists.txt
+++ b/compiler-rt/test/profile/CMakeLists.txt
@@ -28,8 +28,12 @@ foreach(arch ${PROFILE_TEST_ARCH})
   # with MultiThreadedDLL (/MD), so the .objs reference __imp_* symbols;
   # the test binary defaults to /MT and fails to link (LNK2019 __imp__stricmp
   # from interception_win.cpp, LNK4098 default-lib conflicts). Match the
-  # DLL CRT here so test executables link against the same runtime.
-  if(MSVC AND COMPILER_RT_HAS_INTERCEPTION AND NOT 
COMPILER_RT_PROFILE_BAREMETAL)
+  # DLL CRT here so test executables link against the same runtime. This must
+  # mirror the archive's interceptor gate in lib/profile/CMakeLists.txt: the
+  # libs are only merged (and the archive only goes /MD) when ROCm device PGO
+  # is enabled, so without it the test binaries stay on the default /MT.
+  if(MSVC AND COMPILER_RT_BUILD_PROFILE_ROCM
+     AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL)
     string(APPEND PROFILE_TEST_TARGET_CFLAGS
            " -D_MT -D_DLL 
-Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames")
   endif()

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [compiler-rt] [llvm] Reland "[PGO][AMDGPU] Add basic HIP offload PGO support (#177665)" (PR #201499)

Reply via email to