llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-codegen

Author: Yaxun (Sam) Liu (yxsamliu)

<details>
<summary>Changes</summary>

On Windows, HIP rejects a hipMemcpy from a device symbol when the copy goes
past the bounds of the symbol registered with __hipRegisterVar. The device
profile data, counters, and names live in merged linker sections, so copying a
whole section range spills past the one registered symbol and fails. As the
main change, register a separate shadow for each device data, counters, and
names symbol and copy each one using its exact size from hipGetSymbolSize. This
also lets static TUs with several kernels keep all their profile data instead of
dropping later kernels.

As minor fixes, open the device profile file in binary mode and pass the device
names to the correct names arguments of lprofWriteDataImpl so llvm-profdata can
read the raw profile, and give each device TU a unique profile names symbol so
RDC builds with multiple TUs do not clash.

Open the versioned amdhip64_7.dll first, falling back to amdhip64.dll, so the
latest HIP runtime is used.

Depends on #<!-- -->201607 (reland HIP offload PGO compiler support and link the
device-profile runtime); that PR must land first.



---

Patch is 63.42 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/202095.diff


15 Files Affected:

- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+226) 
- (modified) clang/lib/Driver/ToolChains/Linux.cpp (+7) 
- (modified) clang/lib/Driver/ToolChains/MSVC.cpp (+7) 
- (added) clang/test/CodeGenHIP/offload-pgo-sections.hip (+71) 
- (added) clang/test/Driver/hip-profile-rocm-runtime.hip (+31) 
- (modified) compiler-rt/lib/profile/InstrProfilingFile.c (+5-6) 
- (modified) compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp (+341-92) 
- (modified) llvm/include/llvm/IR/RuntimeLibcalls.td (+7-2) 
- (modified) llvm/include/llvm/ProfileData/InstrProf.h (+9) 
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+100-26) 
- (modified) llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp (+2-1) 
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll 
(+32) 
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll (+26) 
- (added) llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll (+36) 
- (added) llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll 
(+22) 


``````````diff
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 259b6c040706b..877ba59607f51 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -26,8 +26,10 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/ReplaceConstant.h"
+#include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -36,6 +38,10 @@ namespace {
 constexpr unsigned CudaFatMagic = 0x466243b1;
 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
 
+static std::string getOffloadProfilingNamesVarName(llvm::StringRef CUIDHash) {
+  return (llvm::Twine(llvm::getInstrProfNamesVarName()) + "_" + 
CUIDHash).str();
+}
+
 class CGNVCUDARuntime : public CGCUDARuntime {
 
   /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
@@ -72,6 +78,16 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
+  /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global,
+  /// emitted only for HIP host compiles when PGO is on. Registered via
+  /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
+  /// can locate the device-side table by name.
+  llvm::GlobalVariable *OffloadProfShadow = nullptr;
+  struct OffloadProfSectionShadowInfo {
+    llvm::GlobalVariable *Shadow;
+    std::string DeviceName;
+  };
+  llvm::SmallVector<OffloadProfSectionShadowInfo, 16> 
OffloadProfSectionShadows;
   /// Whether we generate relocatable device code.
   bool RelocatableDeviceCode;
   /// Mangle context for device.
@@ -176,6 +192,13 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   void transformManagedVars();
   /// Create offloading entries to register globals in RDC mode.
   void createOffloadingEntries();
+  /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global.
+  /// On the device side it is the populated 7-pointer section-bounds table.
+  /// On the host side it is a placeholder void* shadow stored in
+  /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC)
+  /// or createOffloadingEntries (RDC) so the runtime can locate the
+  /// device-side table by name.
+  void emitOffloadProfilingSections();
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -735,6 +758,53 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     }
   }
 
+  // Register the per-TU offload-profiling shadow so the host runtime can
+  // locate the matching device-side __llvm_profile_sections_<CUID>. We
+  // emit both __hipRegisterVar (so the HIP runtime can map the host
+  // shadow to the device symbol) and
+  // __llvm_profile_offload_register_shadow_variable (so the profile
+  // runtime adds the shadow to its drain list).
+  if (OffloadProfShadow) {
+    llvm::Constant *Name =
+        makeConstantString(std::string(OffloadProfShadow->getName()));
+    llvm::Value *RegisterVarArgs[] = {
+        &GpuBinaryHandlePtr,
+        OffloadProfShadow,
+        Name,
+        Name,
+        llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+        llvm::ConstantInt::get(VarSizeTy, 
CGM.getDataLayout().getPointerSize()),
+        llvm::ConstantInt::get(IntTy, /*Constant=*/0),
+        llvm::ConstantInt::get(IntTy, 0)};
+    Builder.CreateCall(RegisterVar, RegisterVarArgs);
+
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    Builder.CreateCall(RegisterShadow, {OffloadProfShadow});
+  }
+
+  if (!OffloadProfSectionShadows.empty()) {
+    llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_section_shadow_variable");
+    for (const auto &Info : OffloadProfSectionShadows) {
+      llvm::Constant *Name = makeConstantString(Info.DeviceName);
+      llvm::Value *RegisterVarArgs[] = {
+          &GpuBinaryHandlePtr,
+          Info.Shadow,
+          Name,
+          Name,
+          llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+          llvm::ConstantInt::get(VarSizeTy,
+                                 CGM.getDataLayout().getPointerSize()),
+          llvm::ConstantInt::get(IntTy, /*Constant=*/0),
+          llvm::ConstantInt::get(IntTy, 0)};
+      Builder.CreateCall(RegisterVar, RegisterVarArgs);
+      Builder.CreateCall(RegisterSectionShadow, {Info.Shadow});
+    }
+  }
+
   Builder.CreateRetVoid();
   return RegisterKernelsFunc;
 }
@@ -1256,11 +1326,167 @@ void CGNVCUDARuntime::createOffloadingEntries() {
           I.Flags.getSurfTexType());
     }
   }
+
+  // Register the per-TU offload-profiling shadow. The offloading entry
+  // makes the linker-wrapper emit the host __hipRegisterVar call in the
+  // combined ctor. Separately emit a per-TU ctor that registers the
+  // shadow with the profile runtime's drain list.
+  if (OffloadProfShadow) {
+    llvm::offloading::emitOffloadingEntry(
+        M, Kind, OffloadProfShadow, OffloadProfShadow->getName(),
+        CGM.getDataLayout().getPointerSize(),
+        llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+
+    llvm::LLVMContext &Ctx = M.getContext();
+    auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_section_shadow_variable");
+    auto *CtorFn = llvm::Function::Create(
+        llvm::FunctionType::get(VoidTy, false),
+        llvm::GlobalValue::InternalLinkage,
+        "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), 
&M);
+    auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn);
+    llvm::IRBuilder<> B(Entry);
+    B.CreateCall(RegisterShadow, {OffloadProfShadow});
+    for (const auto &Info : OffloadProfSectionShadows) {
+      llvm::offloading::emitOffloadingEntry(
+          M, Kind, Info.Shadow, Info.DeviceName,
+          CGM.getDataLayout().getPointerSize(),
+          llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+      B.CreateCall(RegisterSectionShadow, {Info.Shadow});
+    }
+    B.CreateRetVoid();
+    llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535);
+  }
+}
+
+// For HIP host+device compiles with PGO enabled, emit the per-TU global
+// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding
+// section start/stop bounds for the names/counters/data sections plus the
+// raw-version variable. Host side: an opaque void* shadow whose only
+// purpose is to give the host-runtime a registered symbol name to look up
+// via hipGetSymbolAddress; the actual device-side data lives in the
+// matching device-side global.
+void CGNVCUDARuntime::emitOffloadProfilingSections() {
+  if (!CGM.getLangOpts().HIP)
+    return;
+  if (!CGM.getCodeGenOpts().hasProfileInstr())
+    return;
+
+  StringRef CUIDHash = CGM.getContext().getCUIDHash();
+  if (CUIDHash.empty())
+    return;
+
+  llvm::Module &M = CGM.getModule();
+  llvm::LLVMContext &Ctx = M.getContext();
+  std::string Name = ("__llvm_profile_sections_" + CUIDHash).str();
+
+  // If the global already exists (e.g. another TU was merged in), don't
+  // duplicate it.
+  if (M.getNamedValue(Name))
+    return;
+
+  if (CGM.getLangOpts().CUDAIsDevice) {
+    // Device side: emit the populated struct. Section start/stop symbols
+    // are linker-defined (ELF auto-generates __start_/__stop_ for any
+    // section whose name is a valid C identifier; AMDGPU is ELF).
+    unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+    auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS);
+    auto getOrDeclare = [&](StringRef SymName) {
+      if (auto *GV = M.getNamedGlobal(SymName))
+        return GV;
+      auto *GV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName,
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+      GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+      return GV;
+    };
+    auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version");
+    if (!VersionGV) {
+      VersionGV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
+          "__llvm_profile_raw_version",
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+    }
+
+    std::string NamesVarPostfixVarName =
+        std::string(llvm::getInstrProfNamesVarPostfixVarName());
+    if (!M.getNamedValue(NamesVarPostfixVarName)) {
+      auto *NamesVarPostfix = llvm::ConstantDataArray::getString(
+          Ctx, (llvm::Twine("_") + CUIDHash).str(), true);
+      auto *NamesGV = new llvm::GlobalVariable(
+          M, NamesVarPostfix->getType(), /*isConstant=*/true,
+          llvm::GlobalValue::PrivateLinkage, NamesVarPostfix,
+          NamesVarPostfixVarName,
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+      CGM.addCompilerUsedGlobal(NamesGV);
+    }
+
+    auto *StructTy = llvm::StructType::get(
+        Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
+    llvm::Constant *Fields[] = {
+        getOrDeclare("__start___llvm_prf_names"),
+        getOrDeclare("__stop___llvm_prf_names"),
+        getOrDeclare("__start___llvm_prf_cnts"),
+        getOrDeclare("__stop___llvm_prf_cnts"),
+        getOrDeclare("__start___llvm_prf_data"),
+        getOrDeclare("__stop___llvm_prf_data"),
+        VersionGV,
+    };
+    auto *Init = llvm::ConstantStruct::get(StructTy, Fields);
+    auto *GV = new llvm::GlobalVariable(
+        M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
+        Init, Name, /*InsertBefore=*/nullptr, 
llvm::GlobalValue::NotThreadLocal,
+        GlobalAS);
+    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+    CGM.addCompilerUsedGlobal(GV);
+    return;
+  }
+
+  // Host side: emit an opaque void* shadow. Layout doesn't matter — the
+  // runtime locates it by name via hipGetSymbolAddress and treats it as
+  // the address of the device-side struct. Registration with the HIP
+  // runtime is added by makeRegisterGlobalsFn (non-RDC) or
+  // createOffloadingEntries (RDC).
+  auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+  OffloadProfShadow = new llvm::GlobalVariable(
+      M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
+      llvm::ConstantPointerNull::get(PtrTy), Name);
+  CGM.addCompilerUsedGlobal(OffloadProfShadow);
+
+  auto AddSectionShadow = [&](StringRef Kind, StringRef DeviceName) {
+    std::string ShadowName =
+        (Twine("__llvm_profile_shadow_") + Kind + "_" + CUIDHash + "_" +
+         Twine(OffloadProfSectionShadows.size()))
+            .str();
+    auto *Shadow = new llvm::GlobalVariable(
+        M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
+        llvm::ConstantPointerNull::get(PtrTy), ShadowName);
+    CGM.addCompilerUsedGlobal(Shadow);
+    OffloadProfSectionShadows.push_back({Shadow, DeviceName.str()});
+  };
+
+  for (auto &&I : EmittedKernels) {
+    std::string KernelName = getDeviceSideName(cast<NamedDecl>(I.D));
+    AddSectionShadow("data", std::string("__profd_") + KernelName);
+    AddSectionShadow("cnts", std::string("__profc_") + KernelName);
+    AddSectionShadow("names", getOffloadProfilingNamesVarName(CUIDHash));
+  }
 }
 
 // Returns module constructor to be added.
 llvm::Function *CGNVCUDARuntime::finalizeModule() {
   transformManagedVars();
+  emitOffloadProfilingSections();
   if (CGM.getLangOpts().CUDAIsDevice) {
     // Mark ODR-used device variables as compiler used to prevent it from being
     // eliminated by optimization. This is necessary for device variables
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp 
b/clang/lib/Driver/ToolChains/Linux.cpp
index 5f04afe34c554..d3c94c8addffa 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -902,6 +902,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const 
ArgList &Args,
   if (ActiveKinds & Action::OFK_HIP)
     CmdArgs.push_back(
         Args.MakeArgString(StringRef("-L") + RocmInstallation->getLibPath()));
+
+  // For HIP device PGO, link clang_rt.profile_rocm when available. It is a
+  // self-contained superset of clang_rt.profile, emitted first so the base
+  // archive stays inert.
+  if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) &&
+      getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+    CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
 }
 
 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp 
b/clang/lib/Driver/ToolChains/MSVC.cpp
index 6bc58699fb007..8141f9f132421 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -592,6 +592,13 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds, 
const ArgList &Args,
     CmdArgs.append({Args.MakeArgString(StringRef("-libpath:") +
                                        RocmInstallation->getLibPath()),
                     "amdhip64.lib"});
+
+    // For HIP device PGO, link clang_rt.profile_rocm when available. It is a
+    // self-contained superset of clang_rt.profile, emitted first so the base
+    // archive stays inert (avoiding a /MD-vs-/MT CRT mix in the host image).
+    if (needsProfileRT(Args) &&
+        getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+      CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
   }
 }
 
diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip 
b/clang/test/CodeGenHIP/offload-pgo-sections.hip
new file mode 100644
index 0000000000000..8a6a6b2790d30
--- /dev/null
+++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip
@@ -0,0 +1,71 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global
+// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct
+// in addrspace(1). Host compile: void* shadow registered with the HIP
+// runtime and with the profile runtime's drain list.
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=HOST %s
+//
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fgpu-rdc --offload-new-driver \
+// RUN:   -cuid=abc -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=HOST-RDC %s
+
+// Guard: no PGO -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+// Guard: no CUID -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ int helper(int x) { return x + 1; }
+__global__ void kernel(int *p) { *p = helper(*p); }
+
+// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64
+// DEV-DAG: @__llvm_prf_nm_[[CUID:[0-9a-f]+]] = protected addrspace(1) 
constant {{.*}}section "__llvm_prf_names"
+// DEV-DAG: @__llvm_profile_sections_[[CUID]] = protected addrspace(1) 
constant 
{{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version
+// DEV-DAG: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+
+// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST-DAG: @__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}} = global ptr null
+// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+// HOST: define internal void @__hip_register_globals
+// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]],
+// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr 
@__llvm_profile_sections_[[CUID]])
+// HOST: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}})
+
+// HOST-RDC: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST-RDC-DAG: @__llvm_profile_shadow_data_[[CUID]]_0 = global ptr null
+// HOST-RDC-DAG: @__llvm_profile_shadow_cnts_[[CUID]]_1 = global ptr null
+// HOST-RDC-DAG: @__llvm_profile_shadow_names_[[CUID]]_2 = global ptr null
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_sections_[[CUID]]
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_data_[[CUID]]_0
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_cnts_[[CUID]]_1
+// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant 
%struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_names_[[CUID]]_2
+// HOST-RDC: define internal void @__llvm_profile_register_shadow.[[CUID]]()
+// HOST-RDC: call void @__llvm_profile_offload_register_shadow_variable(ptr 
@__llvm_profile_sections_[[CUID]])
+// HOST-RDC-DAG: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_data_[[CUID]]_0)
+// HOST-RDC-DAG: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_cnts_[[CUID]]_1)
+// HOST-RDC-DAG: call void 
@__llvm_profile_offload_register_section_shadow_variable(ptr 
@__llvm_profile_shadow_names_[[CUID]]_2)
+
+// NONE-NOT: __llvm_profile_sections_
+// NONE-NOT: __llvm_profile_offload_register_shadow_variable
diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip 
b/clang/test/Driver/hip-profile-rocm-runtime.hip
new file mode 100644
index 0000000000000..5e99d3f4f2fee
--- /dev/null
+++ b/clang/test/Driver/hip-profile-rocm-runtime.hip
@@ -0,0 +1,31 @@
+// REQUIRES: x86-registered-target, amdgpu-registered-target
+// UNSUPPORTED: system-windows
+
+// Build a fake resource dir containing both the base profile runtime and the
+// ROCm device-profile runtime so the driver's existence check passes.
+// RUN: rm -rf %t && mkdir -p %t/lib/x86_64-unknown-linux
+// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile.a
+// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile_rocm.a
+// RUN: touch %t.o
+
+// HIP host link with PGO links clang_rt.profile_rocm.
+// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \
+// RUN:   -fprofile-instr-generate -resource-dir=%t \
+// RUN:   --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=HIP-PGO %s
+// profile_rocm must precede the base profile so the base archive stays inert.
+// HIP-PGO: "{{.*}}libclang_rt.profile_rocm.a"
+// HIP-PGO: "{{.*}}libclang_rt.profile.a"
+
+// Without profiling, the ROCm device-profile runtime is not linked.
+// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \
+// RUN:...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/202095
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to