https://github.com/yxsamliu updated 
https://github.com/llvm/llvm-project/pull/202095

>From 30db24a96dcaeb9885429769b0268694acc46167 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <[email protected]>
Date: Sun, 7 Jun 2026 16:48:15 -0400
Subject: [PATCH 1/3] [PGO][HIP] Fix HIP device profile collection and sections
 emission

Several related HIP device-PGO fixes:

Windows device collection. HIP rejects a hipMemcpy that reads past the bounds
of a symbol registered with __hipRegisterVar, but device data/counters/names
live in merged linker sections. Register a separate shadow for each device
data, counters, and names symbol and copy each one by its exact hipGetSymbolSize
size; this also lets static TUs with several kernels keep all their profile
data. Open the device profile file in binary mode and pass the device names to
the correct lprofWriteDataImpl arguments so llvm-profdata can read the raw
profile. Open the versioned amdhip64_7.dll first, falling back to amdhip64.dll.

Per-TU sections struct. Clang CodeGen emitted the __llvm_profile_sections_<CUID>
struct (and its section start/stop references) for any profiling-enabled device
TU. A TU with no instrumented device functions then referenced sections nothing
populates, so the RDC device link failed under --no-undefined (and duplicated
__llvm_prf_nm before per-CUID naming). Move the struct emission from CGCUDANV
into the InstrProfiling pass, which emits it only when the TU has profile data;
clang emits only the per-TU names-postfix marker, also making names unique per
TU so RDC builds do not clash.

Dynamic-module interceptors. The hipModuleLoad* interceptors live in a
constructor-only object in clang_rt.profile_rocm that nothing references, so the
linker drops it and dynamic-module programs collect no device profile. When
linking clang_rt.profile_rocm, emit a force-link reference (-u on ELF,
-include: on COFF); the constructor self-skips when the program does not use
hipModuleLoad.

Depends on #201607 (reland HIP offload PGO compiler support and link the
device-profile runtime); that PR must land first.
---
 clang/lib/CodeGen/CGCUDANV.cpp                | 113 +++--
 clang/lib/Driver/ToolChains/Linux.cpp         |   7 +-
 clang/lib/Driver/ToolChains/MSVC.cpp          |   8 +-
 .../test/CodeGenHIP/offload-pgo-sections.hip  |  54 ++-
 .../test/Driver/hip-profile-rocm-runtime.hip  |   2 +
 compiler-rt/lib/profile/InstrProfilingFile.c  |  11 +-
 .../profile/InstrProfilingPlatformROCm.cpp    | 433 ++++++++++++++----
 llvm/include/llvm/ProfileData/InstrProf.h     |   4 +
 .../Instrumentation/InstrProfiling.cpp        |  77 +++-
 9 files changed, 560 insertions(+), 149 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 65f398af7902b..f19813db189ef 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/ReplaceConstant.h"
+#include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -37,6 +38,10 @@ namespace {
 constexpr unsigned CudaFatMagic = 0x466243b1;
 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
 
+static std::string getOffloadProfilingNamesVarName(llvm::StringRef CUIDHash) {
+  return (llvm::Twine(llvm::getInstrProfNamesVarName()) + "_" + 
CUIDHash).str();
+}
+
 class CGNVCUDARuntime : public CGCUDARuntime {
 
   /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
@@ -78,6 +83,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
   /// can locate the device-side table by name.
   llvm::GlobalVariable *OffloadProfShadow = nullptr;
+  struct OffloadProfSectionShadowInfo {
+    llvm::GlobalVariable *Shadow;
+    std::string DeviceName;
+  };
+  llvm::SmallVector<OffloadProfSectionShadowInfo, 16> 
OffloadProfSectionShadows;
   /// Whether we generate relocatable device code.
   bool RelocatableDeviceCode;
   /// Mangle context for device.
@@ -774,6 +784,27 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     Builder.CreateCall(RegisterShadow, {OffloadProfShadow});
   }
 
+  if (!OffloadProfSectionShadows.empty()) {
+    llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_section_shadow_variable");
+    for (const auto &Info : OffloadProfSectionShadows) {
+      llvm::Constant *Name = makeConstantString(Info.DeviceName);
+      llvm::Value *RegisterVarArgs[] = {
+          &GpuBinaryHandlePtr,
+          Info.Shadow,
+          Name,
+          Name,
+          llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+          llvm::ConstantInt::get(VarSizeTy,
+                                 CGM.getDataLayout().getPointerSize()),
+          llvm::ConstantInt::get(IntTy, /*Constant=*/0),
+          llvm::ConstantInt::get(IntTy, 0)};
+      Builder.CreateCall(RegisterVar, RegisterVarArgs);
+      Builder.CreateCall(RegisterSectionShadow, {Info.Shadow});
+    }
+  }
+
   Builder.CreateRetVoid();
   return RegisterKernelsFunc;
 }
@@ -1311,6 +1342,9 @@ void CGNVCUDARuntime::createOffloadingEntries() {
     llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
         llvm::FunctionType::get(VoidTy, {PtrTy}, false),
         "__llvm_profile_offload_register_shadow_variable");
+    llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_section_shadow_variable");
     auto *CtorFn = llvm::Function::Create(
         llvm::FunctionType::get(VoidTy, false),
         llvm::GlobalValue::InternalLinkage,
@@ -1318,6 +1352,13 @@ void CGNVCUDARuntime::createOffloadingEntries() {
     auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn);
     llvm::IRBuilder<> B(Entry);
     B.CreateCall(RegisterShadow, {OffloadProfShadow});
+    for (const auto &Info : OffloadProfSectionShadows) {
+      llvm::offloading::emitOffloadingEntry(
+          M, Kind, Info.Shadow, Info.DeviceName,
+          CGM.getDataLayout().getPointerSize(),
+          llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+      B.CreateCall(RegisterSectionShadow, {Info.Shadow});
+    }
     B.CreateRetVoid();
     llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535);
   }
@@ -1350,50 +1391,23 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() {
     return;
 
   if (CGM.getLangOpts().CUDAIsDevice) {
-    // Device side: emit the populated struct. Section start/stop symbols
-    // are linker-defined (ELF auto-generates __start_/__stop_ for any
-    // section whose name is a valid C identifier; AMDGPU is ELF).
+    // Device side: emit only the per-TU names postfix marker. The sections
+    // struct is emitted later by the InstrProfiling pass, which emits it only
+    // when the TU has profile data, avoiding dangling section references.
     unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
-    auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS);
-    auto getOrDeclare = [&](StringRef SymName) {
-      if (auto *GV = M.getNamedGlobal(SymName))
-        return GV;
-      auto *GV = new llvm::GlobalVariable(
-          M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
-          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName,
-          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
-          GlobalAS);
-      GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
-      return GV;
-    };
-    auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version");
-    if (!VersionGV) {
-      VersionGV = new llvm::GlobalVariable(
-          M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true,
-          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
-          "__llvm_profile_raw_version",
+    std::string NamesVarPostfixVarName =
+        std::string(llvm::getInstrProfNamesVarPostfixVarName());
+    if (!M.getNamedValue(NamesVarPostfixVarName)) {
+      auto *NamesVarPostfix = llvm::ConstantDataArray::getString(
+          Ctx, (llvm::Twine("_") + CUIDHash).str(), true);
+      auto *NamesGV = new llvm::GlobalVariable(
+          M, NamesVarPostfix->getType(), /*isConstant=*/true,
+          llvm::GlobalValue::PrivateLinkage, NamesVarPostfix,
+          NamesVarPostfixVarName,
           /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
           GlobalAS);
+      CGM.addCompilerUsedGlobal(NamesGV);
     }
-
-    auto *StructTy = llvm::StructType::get(
-        Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
-    llvm::Constant *Fields[] = {
-        getOrDeclare("__start___llvm_prf_names"),
-        getOrDeclare("__stop___llvm_prf_names"),
-        getOrDeclare("__start___llvm_prf_cnts"),
-        getOrDeclare("__stop___llvm_prf_cnts"),
-        getOrDeclare("__start___llvm_prf_data"),
-        getOrDeclare("__stop___llvm_prf_data"),
-        VersionGV,
-    };
-    auto *Init = llvm::ConstantStruct::get(StructTy, Fields);
-    auto *GV = new llvm::GlobalVariable(
-        M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
-        Init, Name, /*InsertBefore=*/nullptr, 
llvm::GlobalValue::NotThreadLocal,
-        GlobalAS);
-    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
-    CGM.addCompilerUsedGlobal(GV);
     return;
   }
 
@@ -1407,6 +1421,25 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() {
       M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
       llvm::ConstantPointerNull::get(PtrTy), Name);
   CGM.addCompilerUsedGlobal(OffloadProfShadow);
+
+  auto AddSectionShadow = [&](StringRef Kind, StringRef DeviceName) {
+    std::string ShadowName =
+        (Twine("__llvm_profile_shadow_") + Kind + "_" + CUIDHash + "_" +
+         Twine(OffloadProfSectionShadows.size()))
+            .str();
+    auto *Shadow = new llvm::GlobalVariable(
+        M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
+        llvm::ConstantPointerNull::get(PtrTy), ShadowName);
+    CGM.addCompilerUsedGlobal(Shadow);
+    OffloadProfSectionShadows.push_back({Shadow, DeviceName.str()});
+  };
+
+  for (auto &&I : EmittedKernels) {
+    std::string KernelName = getDeviceSideName(cast<NamedDecl>(I.D));
+    AddSectionShadow("data", std::string("__profd_") + KernelName);
+    AddSectionShadow("cnts", std::string("__profc_") + KernelName);
+    AddSectionShadow("names", getOffloadProfilingNamesVarName(CUIDHash));
+  }
 }
 
 // Returns module constructor to be added.
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp 
b/clang/lib/Driver/ToolChains/Linux.cpp
index c0f44d74a9aac..512788d235fec 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -909,8 +909,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const 
ArgList &Args,
   // self-contained superset of clang_rt.profile, emitted first so the base
   // archive stays inert.
   if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) &&
-      getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+      getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) {
     CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
+    // Force-retain the constructor-only hipModuleLoad* interceptor object; its
+    // constructor self-skips when the program does not use hipModuleLoad.
+    CmdArgs.push_back("-u");
+    CmdArgs.push_back("__llvm_profile_offload_register_dynamic_module");
+  }
 }
 
 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp 
b/clang/lib/Driver/ToolChains/MSVC.cpp
index 3c3bfe33b9f07..0796bdff96d46 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -603,8 +603,14 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds, 
const ArgList &Args,
     // self-contained superset of clang_rt.profile, emitted first so the base
     // archive stays inert (avoiding a /MD-vs-/MT CRT mix in the host image).
     if (needsProfileRT(Args) &&
-        getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+        getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) {
       CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
+      // Force the linker to retain the constructor-only hipModuleLoad*
+      // interceptor object from clang_rt.profile_rocm (see Linux.cpp). The
+      // constructor self-skips for programs that do not use hipModuleLoad.
+      CmdArgs.push_back(
+          "-include:__llvm_profile_offload_register_dynamic_module");
+    }
   }
 }
 
diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip 
b/clang/test/CodeGenHIP/offload-pgo-sections.hip
index 17c6fe7b9e609..d21ba30012212 100644
--- a/clang/test/CodeGenHIP/offload-pgo-sections.hip
+++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip
@@ -1,18 +1,26 @@
 // REQUIRES: amdgpu-registered-target
 // REQUIRES: x86-registered-target
 
-// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global
-// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct
-// in addrspace(1). Host compile: void* shadow registered with the HIP
-// runtime and with the profile runtime's drain list.
+// Verify the per-TU __llvm_profile_sections_<CUID> global for HIP+PGO.
+// Device side: clang emits the names-postfix marker, and the InstrProfiling
+// pass emits the populated 7-pointer struct in addrspace(1) -- but only when
+// the TU actually has profile data records. Host compile: void* shadow
+// registered with the HIP runtime and the profile runtime's drain list.
 
+// The device struct is emitted by the InstrProfiling pass (not clang codegen),
+// so run the pass to observe it.
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
 // RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | opt -passes=instrprof -S \
 // RUN:   | FileCheck -check-prefix=DEV %s
 
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \
 // RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefix=HOST %s
+//
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fgpu-rdc --offload-new-driver \
+// RUN:   -cuid=abc -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=HOST-RDC %s
 
 // Guard: no PGO -> no emission.
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
@@ -24,11 +32,25 @@
 // RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefix=NONE %s
 
+// Guard: PGO on but no instrumented device functions (all device code is
+// constexpr/host-only) -> the pass must not emit the sections struct, so its
+// section references don't dangle at the device link.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip \
+// RUN:   -DEMPTY_DEVICE %s \
+// RUN:   | opt -passes=instrprof -S \
+// RUN:   | FileCheck -check-prefix=EMPTY %s
+
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
 
+#ifdef EMPTY_DEVICE
+// No __global__/instrumented device function: device code folds away.
+__device__ constexpr int dead(int x) { return x + 1; }
+#else
 __device__ int helper(int x) { return x + 1; }
 __global__ void kernel(int *p) { *p = helper(*p); }
+#endif
 
 // DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8
 // DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8
@@ -37,14 +59,34 @@ __global__ void kernel(int *p) { *p = helper(*p); }
 // DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8
 // DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8
 // DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64
-// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) 
constant 
{{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version
-// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+// DEV-DAG: @__llvm_prf_nm_[[CUID:[0-9a-f]+]] = protected addrspace(1) 
constant {{.*}}section "__llvm_prf_names"
+// DEV-DAG: @__llvm_profile_sections_[[CUID]] = protected addrspace(1) 
constant 
{{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version
+// DEV-DAG: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
 
 // HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST-DAG: @__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}} = global ptr null
 // HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
 // HOST: define internal void @__hip_register_globals
 // HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]],
 // HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr 
@__llvm_profile_sections_[[CUID]])
+// HOST: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}})
+
+// HOST-RDC: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST-RDC-DAG: @__llvm_profile_shadow_data_[[CUID]]_0 = global ptr null
+// HOST-RDC-DAG: @__llvm_profile_shadow_cnts_[[CUID]]_1 = global ptr null
+// HOST-RDC-DAG: @__llvm_profile_shadow_names_[[CUID]]_2 = global ptr null
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_sections_[[CUID]]
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_data_[[CUID]]_0
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_cnts_[[CUID]]_1
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_names_[[CUID]]_2
+// HOST-RDC: define internal void @__llvm_profile_register_shadow.[[CUID]]()
+// HOST-RDC: call void @__llvm_profile_offload_register_shadow_variable(ptr 
@__llvm_profile_sections_[[CUID]])
+// HOST-RDC-DAG: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_data_[[CUID]]_0)
+// HOST-RDC-DAG: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_cnts_[[CUID]]_1)
+// HOST-RDC-DAG: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_names_[[CUID]]_2)
 
 // NONE-NOT: __llvm_profile_sections_
 // NONE-NOT: __llvm_profile_offload_register_shadow_variable
+
+// EMPTY-NOT: @__llvm_profile_sections_
+// EMPTY-NOT: @__start___llvm_prf_data
diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip 
b/clang/test/Driver/hip-profile-rocm-runtime.hip
index 5e99d3f4f2fee..fc82db4fc13c0 100644
--- a/clang/test/Driver/hip-profile-rocm-runtime.hip
+++ b/clang/test/Driver/hip-profile-rocm-runtime.hip
@@ -15,6 +15,8 @@
 // RUN:   | FileCheck -check-prefix=HIP-PGO %s
 // profile_rocm must precede the base profile so the base archive stays inert.
 // HIP-PGO: "{{.*}}libclang_rt.profile_rocm.a"
+// The constructor-only hipModuleLoad* interceptor object is force-linked.
+// HIP-PGO: "-u" "__llvm_profile_offload_register_dynamic_module"
 // HIP-PGO: "{{.*}}libclang_rt.profile.a"
 
 // Without profiling, the ROCm device-profile runtime is not linked.
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c 
b/compiler-rt/lib/profile/InstrProfilingFile.c
index 9ea5a2638fac9..98a524392e54a 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1383,7 +1383,7 @@ int __llvm_write_custom_profile(const char *Target,
   TargetFilename[FilenameLength + 1 + TargetLength] = 0;
 
   /* Open and truncate target-specific PGO file */
-  FILE *OutputFile = fopen(TargetFilename, "w");
+  FILE *OutputFile = fopen(TargetFilename, "wb");
   setProfileFile(OutputFile);
 
   if (!OutputFile) {
@@ -1404,11 +1404,10 @@ int __llvm_write_custom_profile(const char *Target,
   if (VersionOverride)
     Version = *VersionOverride;
 
-  /* Write custom data to the file */
-  ReturnValue =
-      lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin,
-                         CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL,
-                         NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version);
+  ReturnValue = lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd,
+                                   CountersBegin, CountersEnd, NULL, NULL,
+                                   lprofGetVPDataReader(), NamesBegin, 
NamesEnd,
+                                   NULL, NULL, NULL, NULL, 0, Version);
   closeFileObject(OutputFile);
 
   // Restore SIGKILL.
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp 
b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
index ee00c572e3a42..e972f22fdae8f 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
@@ -57,8 +57,9 @@ static void unlockDynamicModules(void) {
 }
 #endif
 
-static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
-                                   const char *Target);
+struct OffloadSectionShadowGroup;
+static int processDeviceOffloadPrf(void *DeviceOffloadPrf, const char *Target,
+                                   const OffloadSectionShadowGroup *Sections);
 
 static int isVerboseMode() {
   static int IsVerbose = -1;
@@ -72,6 +73,7 @@ static int isVerboseMode() {
 /* -------------------------------------------------------------------------- 
*/
 
 typedef int (*hipGetSymbolAddressTy)(void **, const void *);
+typedef int (*hipGetSymbolSizeTy)(size_t *, const void *);
 typedef int (*hipMemcpyTy)(void *, const void *, size_t, int);
 typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *);
 typedef int (*hipGetDeviceCountTy)(int *);
@@ -88,6 +90,7 @@ typedef struct {
 typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int);
 
 static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr;
+static hipGetSymbolSizeTy pHipGetSymbolSize = nullptr;
 static hipMemcpyTy pHipMemcpy = nullptr;
 static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr;
 static hipGetDeviceCountTy pHipGetDeviceCount = nullptr;
@@ -113,17 +116,25 @@ static void doEnsureHipLoaded(void) {
   }
 
 #ifdef _WIN32
-  static const char HipLibName[] = "amdhip64.dll";
+  const char *HipLibName = "amdhip64_7.dll";
 #else
-  static const char HipLibName[] = "libamdhip64.so";
+  const char *HipLibName = "libamdhip64.so";
 #endif
 
   void *Handle = __interception::OpenLibrary(HipLibName);
+#ifdef _WIN32
+  if (!Handle) {
+    HipLibName = "amdhip64.dll";
+    Handle = __interception::OpenLibrary(HipLibName);
+  }
+#endif
   if (!Handle)
     return;
 
   pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol(
       Handle, "hipGetSymbolAddress");
+  pHipGetSymbolSize = (hipGetSymbolSizeTy)__interception::LookupSymbol(
+      Handle, "hipGetSymbolSize");
   pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(Handle, "hipMemcpy");
   pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol(
       Handle, "hipModuleGetGlobal");
@@ -189,6 +200,11 @@ static int hipGetSymbolAddress(void **devPtr, const void 
*symbol) {
   return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1;
 }
 
+static int hipGetSymbolSize(size_t *size, const void *symbol) {
+  ensureHipLoaded();
+  return pHipGetSymbolSize ? pHipGetSymbolSize(size, symbol) : -1;
+}
+
 static int hipMemcpy(void *dest, const void *src, size_t len,
                      int kind /*2=DToH*/) {
   ensureHipLoaded();
@@ -471,7 +487,7 @@ extern "C" void 
__llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
         char TargetWithTU[64];
         snprintf(TargetWithTU, sizeof(TargetWithTU), "%s.%d", ArchName,
                  TUIndex);
-        if (processDeviceOffloadPrf(TU->DeviceVar, TUIndex, TargetWithTU) == 0)
+        if (processDeviceOffloadPrf(TU->DeviceVar, TargetWithTU, nullptr) == 0)
           TU->Processed = 1;
         else
           PROF_WARN("failed to process profile data for module %p TU %d\n", 
Ptr,
@@ -505,23 +521,93 @@ static void **OffloadShadowVariables = nullptr;
 static int NumShadowVariables = 0;
 static int CapShadowVariables = 0;
 
+struct OffloadSectionShadow {
+  void *Data;
+  void *Counters;
+  void *Names;
+};
+
+struct OffloadSectionShadowGroup {
+  OffloadSectionShadow *Shadows;
+  int NumShadows;
+  int CapShadows;
+  int NumSections;
+};
+
+static OffloadSectionShadowGroup *OffloadSectionShadowGroups = nullptr;
+static int CapSectionShadowGroups = 0;
+
+static int ensureSectionShadowGroupCapacity(void) {
+  if (CapSectionShadowGroups >= CapShadowVariables)
+    return 0;
+  OffloadSectionShadowGroup *New = (OffloadSectionShadowGroup *)realloc(
+      OffloadSectionShadowGroups, CapShadowVariables * sizeof(*New));
+  if (!New)
+    return -1;
+  __builtin_memset(New + CapSectionShadowGroups, 0,
+                   (CapShadowVariables - CapSectionShadowGroups) *
+                       sizeof(*New));
+  OffloadSectionShadowGroups = New;
+  CapSectionShadowGroups = CapShadowVariables;
+  return 0;
+}
+
+static int ensureSectionShadowCapacity(OffloadSectionShadowGroup *Group,
+                                       int MinCapacity) {
+  if (Group->CapShadows >= MinCapacity)
+    return 0;
+  int NewCap = Group->CapShadows ? Group->CapShadows * 2 : 4;
+  while (NewCap < MinCapacity)
+    NewCap *= 2;
+  OffloadSectionShadow *New =
+      (OffloadSectionShadow *)realloc(Group->Shadows, NewCap * sizeof(*New));
+  if (!New)
+    return -1;
+  __builtin_memset(New + Group->CapShadows, 0,
+                   (NewCap - Group->CapShadows) * sizeof(*New));
+  Group->Shadows = New;
+  Group->CapShadows = NewCap;
+  return 0;
+}
+
 extern "C" void __llvm_profile_offload_register_shadow_variable(void *ptr) {
   if (growPtrArray(&OffloadShadowVariables, &NumShadowVariables,
                    &CapShadowVariables, 64))
     return;
-  OffloadShadowVariables[NumShadowVariables++] = ptr;
+  if (ensureSectionShadowGroupCapacity())
+    return;
+  int Index = NumShadowVariables++;
+  OffloadShadowVariables[Index] = ptr;
+  __builtin_memset(&OffloadSectionShadowGroups[Index], 0,
+                   sizeof(OffloadSectionShadowGroups[Index]));
 }
 
-static void **OffloadSectionShadowVariables = nullptr;
-static int NumSectionShadowVariables = 0;
-static int CapSectionShadowVariables = 0;
-
 extern "C" void
 __llvm_profile_offload_register_section_shadow_variable(void *ptr) {
-  if (growPtrArray(&OffloadSectionShadowVariables, &NumSectionShadowVariables,
-                   &CapSectionShadowVariables, 64))
+  if (NumShadowVariables == 0)
     return;
-  OffloadSectionShadowVariables[NumSectionShadowVariables++] = ptr;
+
+  OffloadSectionShadowGroup *Group =
+      &OffloadSectionShadowGroups[NumShadowVariables - 1];
+  int ShadowIndex = Group->NumSections / 3;
+  if (ensureSectionShadowCapacity(Group, ShadowIndex + 1))
+    return;
+  if (ShadowIndex >= Group->NumShadows)
+    Group->NumShadows = ShadowIndex + 1;
+
+  OffloadSectionShadow *Shadow = &Group->Shadows[ShadowIndex];
+  switch (Group->NumSections % 3) {
+  case 0:
+    Shadow->Data = ptr;
+    break;
+  case 1:
+    Shadow->Counters = ptr;
+    break;
+  case 2:
+    Shadow->Names = ptr;
+    break;
+  }
+  ++Group->NumSections;
 }
 
 namespace {
@@ -547,8 +633,41 @@ struct UniqueFree {
 
 } // namespace
 
-static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
-                                   const char *Target) {
+static int getRegisteredSectionBounds(void *Shadow, void **DevicePtr,
+                                      size_t *Size) {
+  *DevicePtr = nullptr;
+  *Size = 0;
+  int AddrRc = hipGetSymbolAddress(DevicePtr, Shadow);
+  int SizeRc = hipGetSymbolSize(Size, Shadow);
+  return AddrRc == 0 && SizeRc == 0 && *DevicePtr && *Size > 0 ? 0 : -1;
+}
+
+struct RegisteredSectionRange {
+  const void *Data;
+  const void *Counters;
+  const void *Names;
+  size_t DataSize;
+  size_t CountersSize;
+  size_t NamesSize;
+  size_t DataOffset;
+  size_t CountersOffset;
+  size_t NamesOffset;
+};
+
+static int
+hasCompleteSectionShadows(const OffloadSectionShadowGroup *Sections) {
+  if (!Sections || Sections->NumShadows == 0 || Sections->NumSections % 3 != 0)
+    return 0;
+  for (int I = 0; I < Sections->NumShadows; ++I) {
+    if (!Sections->Shadows[I].Data || !Sections->Shadows[I].Counters ||
+        !Sections->Shadows[I].Names)
+      return 0;
+  }
+  return 1;
+}
+
+static int processDeviceOffloadPrf(void *DeviceOffloadPrf, const char *Target,
+                                   const OffloadSectionShadowGroup *Sections) {
   __llvm_profile_gpu_sections HostSections;
 
   if (hipMemcpy(&HostSections, DeviceOffloadPrf, sizeof(HostSections),
@@ -568,6 +687,10 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, 
int TUIndex,
   size_t DataSize = (const char *)DevDataEnd - (const char *)DevDataBegin;
   size_t NamesSize = (const char *)DevNamesEnd - (const char *)DevNamesBegin;
 
+  int UseRegisteredSections = hasCompleteSectionShadows(Sections);
+  RegisteredSectionRange *RegisteredRanges = nullptr;
+  int NumRegisteredRanges = 0;
+
   if (isVerboseMode())
     PROF_NOTE("Section pointers: Cnts=[%p,%p]=%zu Data=[%p,%p]=%zu "
               "Names=[%p,%p]=%zu\n",
@@ -599,78 +722,180 @@ static int processDeviceOffloadPrf(void 
*DeviceOffloadPrf, int TUIndex,
   static size_t CachedDataSize = 0;
 
   // Owns freshly malloc'd buffers; release() transfers ownership to the cache.
-  UniqueFree CntsOwner, DataOwner, NamesOwner;
+  UniqueFree CntsOwner, DataOwner, NamesOwner, RegisteredRangeOwner;
+
+  if (UseRegisteredSections) {
+    NumRegisteredRanges = Sections->NumShadows;
+    RegisteredRangeOwner.reset(
+        malloc(NumRegisteredRanges * sizeof(RegisteredSectionRange)));
+    RegisteredRanges = (RegisteredSectionRange *)RegisteredRangeOwner.get();
+    if (!RegisteredRanges) {
+      PROF_ERR("%s\n", "failed to allocate registered section table");
+      return -1;
+    }
+    __builtin_memset(RegisteredRanges, 0,
+                     NumRegisteredRanges * sizeof(*RegisteredRanges));
+
+    size_t RegisteredDataSize = 0;
+    size_t RegisteredCountersSize = 0;
+    size_t RegisteredNamesSize = 0;
+    for (int I = 0; I < NumRegisteredRanges; ++I) {
+      void *Data = nullptr;
+      void *Counters = nullptr;
+      void *Names = nullptr;
+      size_t ThisDataSize = 0;
+      size_t ThisCountersSize = 0;
+      size_t ThisNamesSize = 0;
+      OffloadSectionShadow *Shadow = &Sections->Shadows[I];
+      if (getRegisteredSectionBounds(Shadow->Data, &Data, &ThisDataSize) != 0 
||
+          getRegisteredSectionBounds(Shadow->Counters, &Counters,
+                                     &ThisCountersSize) != 0 ||
+          getRegisteredSectionBounds(Shadow->Names, &Names, &ThisNamesSize) !=
+              0) {
+        PROF_ERR("%s\n", "failed to get registered section bounds");
+        return -1;
+      }
 
-  if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin &&
-      CountersSize == CachedCntsSize) {
-    HostCountersBegin = CachedHostCnts;
-    CntsReused = 1;
-    if (isVerboseMode())
-      PROF_NOTE("Reusing cached counters section (%zu bytes)\n", CountersSize);
-  } else if (CountersSize > 0) {
-    HostCountersBegin = (char *)malloc(CountersSize);
-    CntsOwner.reset(HostCountersBegin);
-  }
+      RegisteredRanges[I].Data = Data;
+      RegisteredRanges[I].Counters = Counters;
+      RegisteredRanges[I].Names = Names;
+      RegisteredRanges[I].DataSize = ThisDataSize;
+      RegisteredRanges[I].CountersSize = ThisCountersSize;
+      RegisteredRanges[I].NamesSize = ThisNamesSize;
+      RegisteredRanges[I].DataOffset = RegisteredDataSize;
+      RegisteredRanges[I].CountersOffset = RegisteredCountersSize;
+      RegisteredDataSize += ThisDataSize;
+      RegisteredCountersSize += ThisCountersSize;
+
+      int ReuseNames = 0;
+      for (int J = 0; J < I; ++J) {
+        if (RegisteredRanges[J].Names == Names &&
+            RegisteredRanges[J].NamesSize == ThisNamesSize) {
+          RegisteredRanges[I].NamesOffset = RegisteredRanges[J].NamesOffset;
+          ReuseNames = 1;
+          break;
+        }
+      }
+      if (!ReuseNames) {
+        RegisteredRanges[I].NamesOffset = RegisteredNamesSize;
+        RegisteredNamesSize += ThisNamesSize;
+      }
+    }
 
-  if (DataSize > 0 && DevDataBegin == CachedDevDataBegin &&
-      DataSize == CachedDataSize) {
-    HostDataBegin = CachedHostData;
-    DataReused = 1;
-    if (isVerboseMode())
-      PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize);
-  } else if (DataSize > 0) {
+    DataSize = RegisteredDataSize;
+    CountersSize = RegisteredCountersSize;
+    NamesSize = RegisteredNamesSize;
     HostDataBegin = (char *)malloc(DataSize);
+    HostCountersBegin = (char *)malloc(CountersSize);
+    HostNamesBegin = NamesSize ? (char *)malloc(NamesSize) : nullptr;
     DataOwner.reset(HostDataBegin);
-  }
-
-  if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin &&
-      NamesSize == CachedNamesSize) {
-    HostNamesBegin = CachedHostNames;
-    NamesReused = 1;
-    if (isVerboseMode())
-      PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize);
-  } else if (NamesSize > 0) {
-    HostNamesBegin = (char *)malloc(NamesSize);
+    CntsOwner.reset(HostCountersBegin);
     NamesOwner.reset(HostNamesBegin);
-  }
+    if ((DataSize > 0 && !HostDataBegin) ||
+        (CountersSize > 0 && !HostCountersBegin) ||
+        (NamesSize > 0 && !HostNamesBegin)) {
+      PROF_ERR("%s\n", "failed to allocate host memory for device sections");
+      return -1;
+    }
 
-  if ((DataSize > 0 && !HostDataBegin) ||
-      (CountersSize > 0 && !HostCountersBegin) ||
-      (NamesSize > 0 && !HostNamesBegin)) {
-    PROF_ERR("%s\n", "failed to allocate host memory for device sections");
-    return -1;
-  }
+    for (int I = 0; I < NumRegisteredRanges; ++I) {
+      RegisteredSectionRange *R = &RegisteredRanges[I];
+      if (memcpyDeviceToHost(HostDataBegin + R->DataOffset, R->Data,
+                             R->DataSize) != 0 ||
+          memcpyDeviceToHost(HostCountersBegin + R->CountersOffset, 
R->Counters,
+                             R->CountersSize) != 0) {
+        PROF_ERR("%s\n", "failed to copy profile sections from device");
+        return -1;
+      }
 
-  if ((DataSize > 0 && !DataReused &&
-       memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) ||
-      (CountersSize > 0 && !CntsReused &&
-       memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) !=
-           0) ||
-      (NamesSize > 0 && !NamesReused &&
-       memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) {
-    PROF_ERR("%s\n", "failed to copy profile sections from device");
-    return -1;
-  }
+      int CopyNames = 1;
+      for (int J = 0; J < I; ++J) {
+        if (RegisteredRanges[J].Names == R->Names &&
+            RegisteredRanges[J].NamesSize == R->NamesSize) {
+          CopyNames = 0;
+          break;
+        }
+      }
+      if (CopyNames && R->NamesSize > 0 &&
+          memcpyDeviceToHost(HostNamesBegin + R->NamesOffset, R->Names,
+                             R->NamesSize) != 0) {
+        PROF_ERR("%s\n", "failed to copy profile sections from device");
+        return -1;
+      }
+    }
+  } else {
+    if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin &&
+        CountersSize == CachedCntsSize) {
+      HostCountersBegin = CachedHostCnts;
+      CntsReused = 1;
+      if (isVerboseMode())
+        PROF_NOTE("Reusing cached counters section (%zu bytes)\n",
+                  CountersSize);
+    } else if (CountersSize > 0) {
+      HostCountersBegin = (char *)malloc(CountersSize);
+      CntsOwner.reset(HostCountersBegin);
+    }
 
-  /* Cache buffers so RDC-mode multi-shadow drains can reuse them.
-   * release() prevents the scope guards from freeing what the cache owns. */
-  if (!CntsReused && CountersSize > 0) {
-    CachedDevCntsBegin = DevCntsBegin;
-    CachedHostCnts = HostCountersBegin;
-    CachedCntsSize = CountersSize;
-    CntsOwner.release();
-  }
-  if (!DataReused && DataSize > 0) {
-    CachedDevDataBegin = DevDataBegin;
-    CachedHostData = HostDataBegin;
-    CachedDataSize = DataSize;
-    DataOwner.release();
-  }
-  if (!NamesReused && NamesSize > 0) {
-    CachedDevNamesBegin = DevNamesBegin;
-    CachedHostNames = HostNamesBegin;
-    CachedNamesSize = NamesSize;
-    NamesOwner.release();
+    if (DataSize > 0 && DevDataBegin == CachedDevDataBegin &&
+        DataSize == CachedDataSize) {
+      HostDataBegin = CachedHostData;
+      DataReused = 1;
+      if (isVerboseMode())
+        PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize);
+    } else if (DataSize > 0) {
+      HostDataBegin = (char *)malloc(DataSize);
+      DataOwner.reset(HostDataBegin);
+    }
+
+    if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin &&
+        NamesSize == CachedNamesSize) {
+      HostNamesBegin = CachedHostNames;
+      NamesReused = 1;
+      if (isVerboseMode())
+        PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize);
+    } else if (NamesSize > 0) {
+      HostNamesBegin = (char *)malloc(NamesSize);
+      NamesOwner.reset(HostNamesBegin);
+    }
+
+    if ((DataSize > 0 && !HostDataBegin) ||
+        (CountersSize > 0 && !HostCountersBegin) ||
+        (NamesSize > 0 && !HostNamesBegin)) {
+      PROF_ERR("%s\n", "failed to allocate host memory for device sections");
+      return -1;
+    }
+
+    if ((DataSize > 0 && !DataReused &&
+         memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) ||
+        (CountersSize > 0 && !CntsReused &&
+         memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) !=
+             0) ||
+        (NamesSize > 0 && !NamesReused &&
+         memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) {
+      PROF_ERR("%s\n", "failed to copy profile sections from device");
+      return -1;
+    }
+
+    /* Cache buffers so RDC-mode multi-shadow drains can reuse them.
+     * release() prevents the scope guards from freeing what the cache owns. */
+    if (!CntsReused && CountersSize > 0) {
+      CachedDevCntsBegin = DevCntsBegin;
+      CachedHostCnts = HostCountersBegin;
+      CachedCntsSize = CountersSize;
+      CntsOwner.release();
+    }
+    if (!DataReused && DataSize > 0) {
+      CachedDevDataBegin = DevDataBegin;
+      CachedHostData = HostDataBegin;
+      CachedDataSize = DataSize;
+      DataOwner.release();
+    }
+    if (!NamesReused && NamesSize > 0) {
+      CachedDevNamesBegin = DevNamesBegin;
+      CachedHostNames = HostNamesBegin;
+      CachedNamesSize = NamesSize;
+      NamesOwner.release();
+    }
   }
 
   if (isVerboseMode())
@@ -721,16 +946,38 @@ static int processDeviceOffloadPrf(void 
*DeviceOffloadPrf, int TUIndex,
   for (uint64_t i = 0; i < NumData; ++i) {
     if (RelocatedData[i].CounterPtr) {
       ptrdiff_t DeviceCounterPtrOffset = 
(ptrdiff_t)RelocatedData[i].CounterPtr;
-      const char *DeviceDataStructAddr =
-          (const char *)DevDataBegin + (i * sizeof(__llvm_profile_data));
+      size_t DataRecordOffset = i * sizeof(__llvm_profile_data);
+      const char *RangeDevDataBegin = (const char *)DevDataBegin;
+      const char *RangeDevCountersBegin = (const char *)DevCntsBegin;
+      size_t RangeCountersOffset = 0;
+      if (UseRegisteredSections) {
+        int FoundRange = 0;
+        for (int R = 0; R < NumRegisteredRanges; ++R) {
+          RegisteredSectionRange *Range = &RegisteredRanges[R];
+          if (DataRecordOffset < Range->DataOffset ||
+              DataRecordOffset >= Range->DataOffset + Range->DataSize)
+            continue;
+          RangeDevDataBegin = (const char *)Range->Data;
+          RangeDevCountersBegin = (const char *)Range->Counters;
+          RangeCountersOffset = Range->CountersOffset;
+          DataRecordOffset -= Range->DataOffset;
+          FoundRange = 1;
+          break;
+        }
+        if (!FoundRange) {
+          PROF_ERR("%s\n", "failed to locate profile data record range");
+          return -1;
+        }
+      }
+      const char *DeviceDataStructAddr = RangeDevDataBegin + DataRecordOffset;
       const char *DeviceCountersAddr =
           DeviceDataStructAddr + DeviceCounterPtrOffset;
       ptrdiff_t OffsetIntoCountersSection =
-          DeviceCountersAddr - (const char *)DevCntsBegin;
+          DeviceCountersAddr - RangeDevCountersBegin;
 
-      ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters +
-                                    OffsetIntoCountersSection -
-                                    (i * sizeof(__llvm_profile_data));
+      ptrdiff_t NewRelativeOffset =
+          DataSize + PaddingBytesBeforeCounters + RangeCountersOffset +
+          OffsetIntoCountersSection - (i * sizeof(__llvm_profile_data));
       __builtin_memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) 
+
                            offsetof(__llvm_profile_data, CounterPtr),
                        &NewRelativeOffset, sizeof(NewRelativeOffset));
@@ -743,9 +990,6 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, 
int TUIndex,
                          sizeof(RelocatedData[i].Values));
   }
 
-  /* Target already encodes TUIndex when needed. */
-  (void)TUIndex;
-
   ret = __llvm_write_custom_profile(
       Target, (__llvm_profile_data *)BufDataBegin,
       (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin,
@@ -761,8 +1005,8 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, 
int TUIndex,
   return ret;
 }
 
-static int processShadowVariable(void *ShadowVar, int TUIndex,
-                                 const char *Target) {
+static int processShadowVariable(int Index, const char *Target) {
+  void *ShadowVar = OffloadShadowVariables[Index];
   void *DeviceSections = nullptr;
   if (hipGetSymbolAddress(&DeviceSections, ShadowVar) != 0) {
     PROF_WARN("failed to get symbol address for shadow variable %p\n",
@@ -770,7 +1014,12 @@ static int processShadowVariable(void *ShadowVar, int 
TUIndex,
     return -1;
   }
   /* DeviceSections points at the per-TU sections struct itself. */
-  return processDeviceOffloadPrf(DeviceSections, TUIndex, Target);
+  const OffloadSectionShadowGroup *Sections = nullptr;
+  if (Index < CapSectionShadowGroups)
+    Sections = &OffloadSectionShadowGroups[Index];
+  if (!hasCompleteSectionShadows(Sections))
+    return 0;
+  return processDeviceOffloadPrf(DeviceSections, Target, Sections);
 }
 
 static int isHipAvailable(void) {
@@ -815,7 +1064,7 @@ extern "C" int 
__llvm_profile_hip_collect_device_data(void) {
           snprintf(TargetWithIdx, sizeof(TargetWithIdx), "%s.%d", ArchName, i);
           Target = TargetWithIdx;
         }
-        if (processShadowVariable(OffloadShadowVariables[i], i, Target) != 0)
+        if (processShadowVariable(i, Target) != 0)
           Ret = -1;
       }
     }
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h 
b/llvm/include/llvm/ProfileData/InstrProf.h
index 90471e910bb50..0138e310304fa 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -152,6 +152,10 @@ inline StringRef getInstrProfVNodesVarName() { return 
"__llvm_prf_vnodes"; }
 /// of all function's PGO names.
 inline StringRef getInstrProfNamesVarName() { return "__llvm_prf_nm"; }
 
+inline StringRef getInstrProfNamesVarPostfixVarName() {
+  return "__llvm_prf_nm_postfix";
+}
+
 inline StringRef getInstrProfVTableNamesVarName() { return "__llvm_prf_vnm"; }
 
 /// Return the name of a covarage mapping variable (internal linkage)
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp 
b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 8e4ba41919768..6f6ad89126024 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1984,6 +1984,50 @@ void InstrLowerer::emitVNodes() {
   UsedVars.push_back(VNodesVar);
 }
 
+// Build the per-TU device-PGO sections struct: section start/stop bounds for
+// names/counters/data plus the raw version. Returns null if it already exists.
+static GlobalVariable *emitGPUOffloadSectionsStruct(Module &M,
+                                                    StringRef CUIDPostfix) {
+  std::string Name = ("__llvm_profile_sections" + CUIDPostfix).str();
+  if (M.getNamedValue(Name))
+    return nullptr;
+
+  LLVMContext &Ctx = M.getContext();
+  unsigned AS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+  auto Extern = [&](StringRef Sym, Type *Ty, bool IsConst,
+                    GlobalValue::VisibilityTypes Vis) {
+    GlobalVariable *GV = M.getNamedGlobal(Sym);
+    if (!GV) {
+      GV = new GlobalVariable(M, Ty, IsConst, GlobalValue::ExternalLinkage,
+                              nullptr, Sym, nullptr,
+                              GlobalValue::NotThreadLocal, AS);
+      GV->setVisibility(Vis);
+    }
+    return GV;
+  };
+  // Section bounds are hidden i8 markers; raw_version is an i64 constant.
+  auto *I8 = Type::getInt8Ty(Ctx);
+  auto Hidden = GlobalValue::HiddenVisibility;
+  Constant *Fields[] = {Extern("__start___llvm_prf_names", I8, false, Hidden),
+                        Extern("__stop___llvm_prf_names", I8, false, Hidden),
+                        Extern("__start___llvm_prf_cnts", I8, false, Hidden),
+                        Extern("__stop___llvm_prf_cnts", I8, false, Hidden),
+                        Extern("__start___llvm_prf_data", I8, false, Hidden),
+                        Extern("__stop___llvm_prf_data", I8, false, Hidden),
+                        Extern("__llvm_profile_raw_version",
+                               Type::getInt64Ty(Ctx), true,
+                               GlobalValue::DefaultVisibility)};
+  auto *PtrTy = PointerType::get(Ctx, AS);
+  auto *STy =
+      StructType::get(Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
+  auto *GV = new GlobalVariable(M, STy, /*isConstant=*/true,
+                                GlobalValue::ExternalLinkage,
+                                ConstantStruct::get(STy, Fields), Name, 
nullptr,
+                                GlobalValue::NotThreadLocal, AS);
+  GV->setVisibility(GlobalValue::ProtectedVisibility);
+  return GV;
+}
+
 void InstrLowerer::emitNameData() {
   if (ReferencedNames.empty())
     return;
@@ -1998,9 +2042,28 @@ void InstrLowerer::emitNameData() {
   auto *NamesVal =
       ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false);
   std::string NamesVarName = std::string(getInstrProfNamesVarName());
-  NamesVar =
-      new GlobalVariable(M, NamesVal->getType(), true,
-                         GlobalValue::PrivateLinkage, NamesVal, NamesVarName);
+  GlobalValue::LinkageTypes NamesLinkage = GlobalValue::PrivateLinkage;
+  GlobalValue::VisibilityTypes NamesVisibility = 
GlobalValue::DefaultVisibility;
+  std::string GPUCUIDPostfix;
+  if (isGPUProfTarget(M)) {
+    if (auto *GV = M.getNamedGlobal(getInstrProfNamesVarPostfixVarName())) {
+      if (auto *Init =
+              dyn_cast_or_null<ConstantDataArray>(GV->getInitializer())) {
+        if (Init->isCString()) {
+          GPUCUIDPostfix = Init->getAsCString().str();
+          NamesVarName += GPUCUIDPostfix;
+          NamesLinkage = GlobalValue::ExternalLinkage;
+          NamesVisibility = GlobalValue::ProtectedVisibility;
+          removeFromUsedLists(
+              M, [GV](Constant *C) { return C->stripPointerCasts() == GV; });
+          GV->eraseFromParent();
+        }
+      }
+    }
+  }
+  NamesVar = new GlobalVariable(M, NamesVal->getType(), true, NamesLinkage,
+                                NamesVal, NamesVarName);
+  NamesVar->setVisibility(NamesVisibility);
 
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
@@ -2019,6 +2082,14 @@ void InstrLowerer::emitNameData() {
 
   for (auto *NamePtr : ReferencedNames)
     NamePtr->eraseFromParent();
+
+  // Emit the device sections struct only when this TU produced profile data, 
so
+  // its section start/stop references are backed by a real section.
+  bool HasData = llvm::any_of(ProfileDataMap,
+                              [](const auto &KV) { return KV.second.DataVar; 
});
+  if (!GPUCUIDPostfix.empty() && HasData)
+    if (GlobalVariable *GV = emitGPUOffloadSectionsStruct(M, GPUCUIDPostfix))
+      CompilerUsedVars.push_back(GV);
 }
 
 void InstrLowerer::emitVTableNames() {

>From d899577e5c0a1d476978a67a3760f87521a5665d Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <[email protected]>
Date: Tue, 9 Jun 2026 09:52:18 -0400
Subject: [PATCH 2/3] Address HIP PGO review comments

---
 clang/lib/CodeGen/CGCUDANV.cpp | 36 +++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index f19813db189ef..f8c3810297256 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -38,10 +38,6 @@ namespace {
 constexpr unsigned CudaFatMagic = 0x466243b1;
 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
 
-static std::string getOffloadProfilingNamesVarName(llvm::StringRef CUIDHash) {
-  return (llvm::Twine(llvm::getInstrProfNamesVarName()) + "_" + 
CUIDHash).str();
-}
-
 class CGNVCUDARuntime : public CGCUDARuntime {
 
   /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
@@ -767,15 +763,17 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
   if (OffloadProfShadow) {
     llvm::Constant *Name =
         makeConstantString(std::string(OffloadProfShadow->getName()));
+    llvm::Constant *IntZero = llvm::ConstantInt::get(IntTy, 0);
     llvm::Value *RegisterVarArgs[] = {
         &GpuBinaryHandlePtr,
         OffloadProfShadow,
         Name,
         Name,
-        llvm::ConstantInt::get(IntTy, /*Extern=*/0),
-        llvm::ConstantInt::get(VarSizeTy, 
CGM.getDataLayout().getPointerSize()),
-        llvm::ConstantInt::get(IntTy, /*Constant=*/0),
-        llvm::ConstantInt::get(IntTy, 0)};
+        IntZero,
+        llvm::ConstantInt::get(VarSizeTy,
+                               CGM.getDataLayout().getPointerSize(/*AS=*/0)),
+        IntZero,
+        IntZero};
     Builder.CreateCall(RegisterVar, RegisterVarArgs);
 
     llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
@@ -788,6 +786,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction(
         llvm::FunctionType::get(VoidTy, {PtrTy}, false),
         "__llvm_profile_offload_register_section_shadow_variable");
+    llvm::Constant *IntZero = llvm::ConstantInt::get(IntTy, 0);
     for (const auto &Info : OffloadProfSectionShadows) {
       llvm::Constant *Name = makeConstantString(Info.DeviceName);
       llvm::Value *RegisterVarArgs[] = {
@@ -795,11 +794,11 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
           Info.Shadow,
           Name,
           Name,
-          llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+          IntZero,
           llvm::ConstantInt::get(VarSizeTy,
-                                 CGM.getDataLayout().getPointerSize()),
-          llvm::ConstantInt::get(IntTy, /*Constant=*/0),
-          llvm::ConstantInt::get(IntTy, 0)};
+                                 CGM.getDataLayout().getPointerSize(/*AS=*/0)),
+          IntZero,
+          IntZero};
       Builder.CreateCall(RegisterVar, RegisterVarArgs);
       Builder.CreateCall(RegisterSectionShadow, {Info.Shadow});
     }
@@ -1334,7 +1333,7 @@ void CGNVCUDARuntime::createOffloadingEntries() {
   if (OffloadProfShadow) {
     llvm::offloading::emitOffloadingEntry(
         M, Kind, OffloadProfShadow, OffloadProfShadow->getName(),
-        CGM.getDataLayout().getPointerSize(),
+        CGM.getDataLayout().getPointerSize(/*AS=*/0),
         llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
 
     llvm::LLVMContext &Ctx = M.getContext();
@@ -1355,7 +1354,7 @@ void CGNVCUDARuntime::createOffloadingEntries() {
     for (const auto &Info : OffloadProfSectionShadows) {
       llvm::offloading::emitOffloadingEntry(
           M, Kind, Info.Shadow, Info.DeviceName,
-          CGM.getDataLayout().getPointerSize(),
+          CGM.getDataLayout().getPointerSize(/*AS=*/0),
           llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
       B.CreateCall(RegisterSectionShadow, {Info.Shadow});
     }
@@ -1422,7 +1421,7 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() {
       llvm::ConstantPointerNull::get(PtrTy), Name);
   CGM.addCompilerUsedGlobal(OffloadProfShadow);
 
-  auto AddSectionShadow = [&](StringRef Kind, StringRef DeviceName) {
+  auto AddSectionShadow = [&](StringRef Kind, const Twine &DeviceName) {
     std::string ShadowName =
         (Twine("__llvm_profile_shadow_") + Kind + "_" + CUIDHash + "_" +
          Twine(OffloadProfSectionShadows.size()))
@@ -1436,9 +1435,10 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() {
 
   for (auto &&I : EmittedKernels) {
     std::string KernelName = getDeviceSideName(cast<NamedDecl>(I.D));
-    AddSectionShadow("data", std::string("__profd_") + KernelName);
-    AddSectionShadow("cnts", std::string("__profc_") + KernelName);
-    AddSectionShadow("names", getOffloadProfilingNamesVarName(CUIDHash));
+    AddSectionShadow("data", Twine("__profd_") + KernelName);
+    AddSectionShadow("cnts", Twine("__profc_") + KernelName);
+    AddSectionShadow("names",
+                     Twine(llvm::getInstrProfNamesVarName()) + "_" + CUIDHash);
   }
 }
 

>From 033db5c3559727f04401b94a1fbfef324e5b3ed4 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <[email protected]>
Date: Wed, 10 Jun 2026 14:23:34 -0400
Subject: [PATCH 3/3] Track used HIP devices

---
 .../profile/InstrProfilingPlatformROCm.cpp    | 102 +++++++++++++++---
 1 file changed, 90 insertions(+), 12 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp 
b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
index e972f22fdae8f..d06632a91a69f 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp
@@ -101,6 +101,10 @@ static hipGetDevicePropertiesTy pHipGetDeviceProperties = 
nullptr;
 static int NumDevices = 0;
 /* 256 matches hipDeviceProp_t::gcnArchName, the source field width. */
 static char (*DeviceArchNames)[256] = nullptr;
+#if defined(__linux__) && !defined(_WIN32)
+static unsigned char *UsedDevices = nullptr;
+static int AnyDeviceUsed = 0;
+#endif
 
 /* -------------------------------------------------------------------------- 
*/
 /*  Device-to-host copies                                                     
*/
@@ -160,6 +164,11 @@ static void doEnsureHipLoaded(void) {
         PROF_ERR("%s\n", "failed to allocate device arch name table");
         return;
       }
+#if defined(__linux__) && !defined(_WIN32)
+      UsedDevices = (unsigned char *)calloc(Count, sizeof(*UsedDevices));
+      if (!UsedDevices && isVerboseMode())
+        PROF_NOTE("%s\n", "Device-use tracking disabled");
+#endif
       HipDevicePropMinimal Prop;
       for (int i = 0; i < Count; ++i) {
         __builtin_memset(&Prop, 0, sizeof(Prop));
@@ -234,6 +243,26 @@ static int hipSetDevice(int DeviceId) {
   return pHipSetDevice ? pHipSetDevice(DeviceId) : -1;
 }
 
+#if defined(__linux__) && !defined(_WIN32)
+static void markCurrentDeviceUsed(void) {
+  int DeviceId = -1;
+  if (hipGetDevice(&DeviceId) != 0 || DeviceId < 0 || DeviceId >= NumDevices ||
+      !UsedDevices)
+    return;
+  __atomic_store_n(&UsedDevices[DeviceId], 1, __ATOMIC_RELAXED);
+  __atomic_store_n(&AnyDeviceUsed, 1, __ATOMIC_RELEASE);
+}
+
+static int shouldCollectDevice(int DeviceId) {
+  if (UsedDevices && __atomic_load_n(&AnyDeviceUsed, __ATOMIC_ACQUIRE) &&
+      !__atomic_load_n(&UsedDevices[DeviceId], __ATOMIC_RELAXED))
+    return 0;
+  return 1;
+}
+#else
+static int shouldCollectDevice(int) { return 1; }
+#endif
+
 static const char *getDeviceArchName(int DeviceId) {
   if (DeviceId < 0 || DeviceId >= NumDevices || !DeviceArchNames[DeviceId][0])
     return "amdgpu";
@@ -1046,6 +1075,11 @@ extern "C" int 
__llvm_profile_hip_collect_device_data(void) {
     hipGetDevice(&OrigDevice);
 
     for (int Dev = 0; Dev < NumDevices; ++Dev) {
+      if (!shouldCollectDevice(Dev)) {
+        if (isVerboseMode())
+          PROF_NOTE("Skipping unused device %d\n", Dev);
+        continue;
+      }
       if (hipSetDevice(Dev) != 0) {
         if (isVerboseMode())
           PROF_NOTE("Failed to set device %d, skipping\n", Dev);
@@ -1094,10 +1128,41 @@ extern "C" int 
__llvm_profile_hip_collect_device_data(void) {
   return Ret;
 }
 
-/* Interceptors for hipModuleLoad* / hipModuleUnload. Linux only. */
+/* Linux HIP interceptors. */
 
 #if defined(__linux__) && !defined(_WIN32)
 
+typedef struct {
+  unsigned int x;
+  unsigned int y;
+  unsigned int z;
+} HipDim3;
+
+typedef void *HipFunction;
+typedef void *HipStream;
+
+static int recordHipLaunchResult(int Rc) {
+  if (Rc == 0)
+    markCurrentDeviceUsed();
+  return Rc;
+}
+
+INTERCEPTOR(int, hipLaunchKernel, const void *Function, HipDim3 GridDim,
+            HipDim3 BlockDim, void **Args, size_t SharedMemBytes,
+            HipStream Stream) {
+  return recordHipLaunchResult(REAL(hipLaunchKernel)(
+      Function, GridDim, BlockDim, Args, SharedMemBytes, Stream));
+}
+
+INTERCEPTOR(int, hipModuleLaunchKernel, HipFunction Function, unsigned 
GridDimX,
+            unsigned GridDimY, unsigned GridDimZ, unsigned BlockDimX,
+            unsigned BlockDimY, unsigned BlockDimZ, unsigned SharedMemBytes,
+            HipStream Stream, void **KernelParams, void **Extra) {
+  return recordHipLaunchResult(REAL(hipModuleLaunchKernel)(
+      Function, GridDimX, GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
+      SharedMemBytes, Stream, KernelParams, Extra));
+}
+
 INTERCEPTOR(int, hipModuleLoad, void **module, const char *fname) {
   int rc = REAL(hipModuleLoad)(module, fname);
   /* Pass NULL image: no in-memory ELF is available for filename loads,
@@ -1127,20 +1192,33 @@ INTERCEPTOR(int, hipModuleUnload, void *module) {
   return REAL(hipModuleUnload)(module);
 }
 
-__attribute__((constructor)) static void installHipModuleInterceptors() {
-  /* Skip when the HIP runtime is not loaded. INTERCEPT_FUNCTION uses the
-   * sanitizer interception framework, which can perturb dlsym/PLT state for
-   * the rest of the process even when the target symbol is absent; non-HIP
-   * programs linked with libclang_rt.profile.a must see zero side effects. */
-  if (!dlsym(RTLD_DEFAULT, "hipModuleLoad"))
+__attribute__((constructor)) static void installHipInterceptors() {
+  /* Avoid interception unless the HIP runtime is already loaded. */
+  int HasLaunchKernel = dlsym(RTLD_DEFAULT, "hipLaunchKernel") != nullptr;
+  int HasModuleLaunchKernel =
+      dlsym(RTLD_DEFAULT, "hipModuleLaunchKernel") != nullptr;
+  int HasModuleLoad = dlsym(RTLD_DEFAULT, "hipModuleLoad") != nullptr;
+  if (!HasLaunchKernel && !HasModuleLaunchKernel && !HasModuleLoad)
     return;
-  if (!INTERCEPT_FUNCTION(hipModuleLoad))
+  int InstalledLaunch = 0;
+  if (HasLaunchKernel)
+    InstalledLaunch |= INTERCEPT_FUNCTION(hipLaunchKernel);
+  if (HasModuleLaunchKernel)
+    InstalledLaunch |= INTERCEPT_FUNCTION(hipModuleLaunchKernel);
+  int InstalledAny = InstalledLaunch;
+  if (HasModuleLoad) {
+    HasModuleLoad = INTERCEPT_FUNCTION(hipModuleLoad);
+    InstalledAny |= HasModuleLoad;
+  }
+  if (!InstalledAny)
     return;
   if (isVerboseMode())
-    PROF_NOTE("%s", "Installing hipModuleLoad*/hipModuleUnload 
interceptors\n");
-  INTERCEPT_FUNCTION(hipModuleLoadData);
-  INTERCEPT_FUNCTION(hipModuleLoadDataEx);
-  INTERCEPT_FUNCTION(hipModuleUnload);
+    PROF_NOTE("%s", "Installing HIP interceptors\n");
+  if (HasModuleLoad) {
+    INTERCEPT_FUNCTION(hipModuleLoadData);
+    INTERCEPT_FUNCTION(hipModuleLoadDataEx);
+    INTERCEPT_FUNCTION(hipModuleUnload);
+  }
 }
 
 #endif /* __linux__ */

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to