llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Yaxun (Sam) Liu (yxsamliu)

<details>
<summary>Changes</summary>

This relands the compiler portion of #<!-- -->177665 ("[PGO][AMDGPU] Add basic 
HIP offload PGO support"), which was reverted in #<!-- -->201416, and adds the 
driver wiring needed for the new separate device-profile runtime library.

The first commit restores the reverted compiler state verbatim: the AMDGPU 
instrumentation lowering in LLVM (InstrProfiling.cpp and 
PGOInstrumentation.cpp), the HIP device-variable and shadow registration in 
Clang codegen (CGCUDANV.cpp), the supporting InstrProf.h and RuntimeLibcalls.td 
changes, and their tests. This part depends on the runtime ABI provided by the 
compiler-rt change, which is relanded separately.

The second commit adds the driver wiring to link the ROCm device-profile 
collection runtime. When building HIP with device PGO (-fprofile-generate or 
-fprofile-instr-generate) the host program must drain device-side profile 
counters. That collection runtime now lives in the separate 
clang_rt.profile_rocm library, so the driver must link it into the host image 
in addition to the base clang_rt.profile. This is done in addOffloadRTLibs, 
which already knows the active offload kinds and links libamdhip64, for both 
the Linux/GNU and MSVC host toolchains. It is gated on the HIP offload kind, 
needsProfileRT, and the library being present in the resource directory. 
Non-HIP profile links and HIP links without profiling are unaffected.

This change depends on the compiler-rt change that adds clang_rt.profile_rocm.


---

Patch is 33.50 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/201607.diff


13 Files Affected:

- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+152) 
- (modified) clang/lib/Driver/ToolChains/Linux.cpp (+7) 
- (modified) clang/lib/Driver/ToolChains/MSVC.cpp (+8) 
- (added) clang/test/CodeGenHIP/offload-pgo-sections.hip (+50) 
- (added) clang/test/Driver/hip-profile-rocm-runtime.hip (+32) 
- (modified) llvm/include/llvm/IR/RuntimeLibcalls.td (+7-2) 
- (modified) llvm/include/llvm/ProfileData/InstrProf.h (+5) 
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+83-26) 
- (modified) llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp (+2-1) 
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll 
(+32) 
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll (+26) 
- (added) llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll (+36) 
- (added) llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll 
(+22) 


``````````diff
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 259b6c040706b..65f398af7902b 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -28,6 +28,7 @@
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
+  /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global,
+  /// emitted only for HIP host compiles when PGO is on. Registered via
+  /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
+  /// can locate the device-side table by name.
+  llvm::GlobalVariable *OffloadProfShadow = nullptr;
   /// Whether we generate relocatable device code.
   bool RelocatableDeviceCode;
   /// Mangle context for device.
@@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   void transformManagedVars();
   /// Create offloading entries to register globals in RDC mode.
   void createOffloadingEntries();
+  /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global.
+  /// On the device side it is the populated 7-pointer section-bounds table.
+  /// On the host side it is a placeholder void* shadow stored in
+  /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC)
+  /// or createOffloadingEntries (RDC) so the runtime can locate the
+  /// device-side table by name.
+  void emitOffloadProfilingSections();
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     }
   }
 
+  // Register the per-TU offload-profiling shadow so the host runtime can
+  // locate the matching device-side __llvm_profile_sections_<CUID>. We
+  // emit both __hipRegisterVar (so the HIP runtime can map the host
+  // shadow to the device symbol) and
+  // __llvm_profile_offload_register_shadow_variable (so the profile
+  // runtime adds the shadow to its drain list).
+  if (OffloadProfShadow) {
+    llvm::Constant *Name =
+        makeConstantString(std::string(OffloadProfShadow->getName()));
+    llvm::Value *RegisterVarArgs[] = {
+        &GpuBinaryHandlePtr,
+        OffloadProfShadow,
+        Name,
+        Name,
+        llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+        llvm::ConstantInt::get(VarSizeTy, 
CGM.getDataLayout().getPointerSize()),
+        llvm::ConstantInt::get(IntTy, /*Constant=*/0),
+        llvm::ConstantInt::get(IntTy, 0)};
+    Builder.CreateCall(RegisterVar, RegisterVarArgs);
+
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    Builder.CreateCall(RegisterShadow, {OffloadProfShadow});
+  }
+
   Builder.CreateRetVoid();
   return RegisterKernelsFunc;
 }
@@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() {
           I.Flags.getSurfTexType());
     }
   }
+
+  // Register the per-TU offload-profiling shadow. The offloading entry
+  // makes the linker-wrapper emit the host __hipRegisterVar call in the
+  // combined ctor. Separately emit a per-TU ctor that registers the
+  // shadow with the profile runtime's drain list.
+  if (OffloadProfShadow) {
+    llvm::offloading::emitOffloadingEntry(
+        M, Kind, OffloadProfShadow, OffloadProfShadow->getName(),
+        CGM.getDataLayout().getPointerSize(),
+        llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+
+    llvm::LLVMContext &Ctx = M.getContext();
+    auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+    llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+        "__llvm_profile_offload_register_shadow_variable");
+    auto *CtorFn = llvm::Function::Create(
+        llvm::FunctionType::get(VoidTy, false),
+        llvm::GlobalValue::InternalLinkage,
+        "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), 
&M);
+    auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn);
+    llvm::IRBuilder<> B(Entry);
+    B.CreateCall(RegisterShadow, {OffloadProfShadow});
+    B.CreateRetVoid();
+    llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535);
+  }
+}
+
+// For HIP host+device compiles with PGO enabled, emit the per-TU global
+// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding
+// section start/stop bounds for the names/counters/data sections plus the
+// raw-version variable. Host side: an opaque void* shadow whose only
+// purpose is to give the host-runtime a registered symbol name to look up
+// via hipGetSymbolAddress; the actual device-side data lives in the
+// matching device-side global.
+void CGNVCUDARuntime::emitOffloadProfilingSections() {
+  if (!CGM.getLangOpts().HIP)
+    return;
+  if (!CGM.getCodeGenOpts().hasProfileInstr())
+    return;
+
+  StringRef CUIDHash = CGM.getContext().getCUIDHash();
+  if (CUIDHash.empty())
+    return;
+
+  llvm::Module &M = CGM.getModule();
+  llvm::LLVMContext &Ctx = M.getContext();
+  std::string Name = ("__llvm_profile_sections_" + CUIDHash).str();
+
+  // If the global already exists (e.g. another TU was merged in), don't
+  // duplicate it.
+  if (M.getNamedValue(Name))
+    return;
+
+  if (CGM.getLangOpts().CUDAIsDevice) {
+    // Device side: emit the populated struct. Section start/stop symbols
+    // are linker-defined (ELF auto-generates __start_/__stop_ for any
+    // section whose name is a valid C identifier; AMDGPU is ELF).
+    unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+    auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS);
+    auto getOrDeclare = [&](StringRef SymName) {
+      if (auto *GV = M.getNamedGlobal(SymName))
+        return GV;
+      auto *GV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName,
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+      GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+      return GV;
+    };
+    auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version");
+    if (!VersionGV) {
+      VersionGV = new llvm::GlobalVariable(
+          M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true,
+          llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
+          "__llvm_profile_raw_version",
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          GlobalAS);
+    }
+
+    auto *StructTy = llvm::StructType::get(
+        Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
+    llvm::Constant *Fields[] = {
+        getOrDeclare("__start___llvm_prf_names"),
+        getOrDeclare("__stop___llvm_prf_names"),
+        getOrDeclare("__start___llvm_prf_cnts"),
+        getOrDeclare("__stop___llvm_prf_cnts"),
+        getOrDeclare("__start___llvm_prf_data"),
+        getOrDeclare("__stop___llvm_prf_data"),
+        VersionGV,
+    };
+    auto *Init = llvm::ConstantStruct::get(StructTy, Fields);
+    auto *GV = new llvm::GlobalVariable(
+        M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
+        Init, Name, /*InsertBefore=*/nullptr, 
llvm::GlobalValue::NotThreadLocal,
+        GlobalAS);
+    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+    CGM.addCompilerUsedGlobal(GV);
+    return;
+  }
+
+  // Host side: emit an opaque void* shadow. Layout doesn't matter — the
+  // runtime locates it by name via hipGetSymbolAddress and treats it as
+  // the address of the device-side struct. Registration with the HIP
+  // runtime is added by makeRegisterGlobalsFn (non-RDC) or
+  // createOffloadingEntries (RDC).
+  auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+  OffloadProfShadow = new llvm::GlobalVariable(
+      M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
+      llvm::ConstantPointerNull::get(PtrTy), Name);
+  CGM.addCompilerUsedGlobal(OffloadProfShadow);
 }
 
 // Returns module constructor to be added.
 llvm::Function *CGNVCUDARuntime::finalizeModule() {
   transformManagedVars();
+  emitOffloadProfilingSections();
   if (CGM.getLangOpts().CUDAIsDevice) {
     // Mark ODR-used device variables as compiler used to prevent it from being
     // eliminated by optimization. This is necessary for device variables
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp 
b/clang/lib/Driver/ToolChains/Linux.cpp
index 5f04afe34c554..227e5e2811dac 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -902,6 +902,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const 
ArgList &Args,
   if (ActiveKinds & Action::OFK_HIP)
     CmdArgs.push_back(
         Args.MakeArgString(StringRef("-L") + RocmInstallation->getLibPath()));
+
+  // For HIP with device PGO, link the host-side ROCm device-profile collection
+  // runtime (clang_rt.profile_rocm) when it is available. It registers a
+  // collector with the base profile runtime to drain device counters.
+  if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) &&
+      getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+    CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
 }
 
 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp 
b/clang/lib/Driver/ToolChains/MSVC.cpp
index 6bc58699fb007..7829831f7d103 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -592,6 +592,14 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds, 
const ArgList &Args,
     CmdArgs.append({Args.MakeArgString(StringRef("-libpath:") +
                                        RocmInstallation->getLibPath()),
                     "amdhip64.lib"});
+
+    // For HIP with device PGO, link the host-side ROCm device-profile
+    // collection runtime (clang_rt.profile_rocm) when it is available. It
+    // registers a collector with the base profile runtime to drain device
+    // counters.
+    if (needsProfileRT(Args) &&
+        getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+      CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
   }
 }
 
diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip 
b/clang/test/CodeGenHIP/offload-pgo-sections.hip
new file mode 100644
index 0000000000000..17c6fe7b9e609
--- /dev/null
+++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip
@@ -0,0 +1,50 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global
+// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct
+// in addrspace(1). Host compile: void* shadow registered with the HIP
+// runtime and with the profile runtime's drain list.
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=HOST %s
+
+// Guard: no PGO -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+// Guard: no CUID -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=NONE %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ int helper(int x) { return x + 1; }
+__global__ void kernel(int *p) { *p = helper(*p); }
+
+// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64
+// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) 
constant 
{{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version
+// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+
+// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+// HOST: define internal void @__hip_register_globals
+// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]],
+// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr 
@__llvm_profile_sections_[[CUID]])
+
+// NONE-NOT: __llvm_profile_sections_
+// NONE-NOT: __llvm_profile_offload_register_shadow_variable
diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip 
b/clang/test/Driver/hip-profile-rocm-runtime.hip
new file mode 100644
index 0000000000000..0fe3ecd5ab660
--- /dev/null
+++ b/clang/test/Driver/hip-profile-rocm-runtime.hip
@@ -0,0 +1,32 @@
+// REQUIRES: x86-registered-target, amdgpu-registered-target
+// UNSUPPORTED: system-windows
+
+// Build a fake resource dir containing both the base profile runtime and the
+// ROCm device-profile runtime so the driver's existence check passes.
+// RUN: rm -rf %t && mkdir -p %t/lib/x86_64-unknown-linux
+// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile.a
+// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile_rocm.a
+// RUN: touch %t.o
+
+// HIP host link with PGO links clang_rt.profile_rocm.
+// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \
+// RUN:   -fprofile-instr-generate -resource-dir=%t \
+// RUN:   --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=HIP-PGO %s
+// The ROCm device-profile runtime is linked before the base profile runtime so
+// its reference to the collector hook resolves against the base archive.
+// HIP-PGO-DAG: "{{.*}}libclang_rt.profile_rocm.a"
+// HIP-PGO-DAG: "{{.*}}libclang_rt.profile.a"
+
+// Without profiling, the ROCm device-profile runtime is not linked.
+// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \
+// RUN:   -resource-dir=%t --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=HIP-NOPGO %s
+// HIP-NOPGO-NOT: libclang_rt.profile_rocm.a
+
+// A non-HIP host link with PGO does not link the ROCm device-profile runtime.
+// RUN: %clang -### --target=x86_64-unknown-linux \
+// RUN:   -fprofile-instr-generate -resource-dir=%t %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=HOST-PGO %s
+// HOST-PGO: "{{.*}}libclang_rt.profile.a"
+// HOST-PGO-NOT: libclang_rt.profile_rocm.a
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td 
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index cbdc48a9a717f..37bad559f49e7 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -500,6 +500,9 @@ def DEOPTIMIZE : RuntimeLibcall;
 // Return address
 def RETURN_ADDRESS : RuntimeLibcall;
 
+// GPU profiling
+def PROFILE_INSTRUMENT_GPU : RuntimeLibcall;
+
 // Clear cache
 def CLEAR_CACHE : RuntimeLibcall;
 def RISCV_FLUSH_ICACHE : RuntimeLibcall;
@@ -2241,8 +2244,10 @@ def WindowsARM64ECSystemLibrary
 
 def isAMDGPU : RuntimeLibcallPredicate<"TT.isAMDGPU()">;
 
-// No calls.
-def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add)>;
+def __llvm_profile_instrument_gpu : RuntimeLibcallImpl<PROFILE_INSTRUMENT_GPU>;
+
+def AMDGPUSystemLibrary
+    : SystemRuntimeLibrary<isAMDGPU, (add __llvm_profile_instrument_gpu)>;
 
 
//===----------------------------------------------------------------------===//
 // ARM Runtime Libcalls
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h 
b/llvm/include/llvm/ProfileData/InstrProf.h
index dffc58281c2d9..b7f917ec39b70 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -122,6 +122,11 @@ inline StringRef getInstrProfValueProfMemOpFuncName() {
 /// Return the prefix of the name of the variables to function as a filter.
 inline StringRef getInstrProfVarPrefix() { return "__prof"; }
 
+/// Return the name of the GPU wave-cooperative counter increment helper.
+inline StringRef getInstrProfInstrumentGPUFuncName() {
+  return INSTR_PROF_INSTRUMENT_GPU_FUNC_STR;
+}
+
 /// Return the name prefix of variables containing instrumented function names.
 inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; }
 
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp 
b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index dabd495cddd49..8e4ba41919768 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Frontend/Offloading/Utility.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -33,14 +34,17 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
 #include "llvm/ProfileData/InstrProf.h"
@@ -287,6 +291,8 @@ class InstrLowerer final {
   GlobalVariable *NamesVar = nullptr;
   size_t NamesSize = 0;
 
+  StructType *ProfileDataTy = nullptr;
+
   // vector of counter load/store pairs to be register promoted.
   std::vector<LoadStorePair> PromotionCandidates;
 
@@ -407,6 +413,9 @@ class InstrLowerer final {
   /// Create a static initializer for our data, on platforms that need it,
   /// and for any profile output file that was specified.
   void emitInitialization();
+
+  /// Return the __llvm_profile_data struct type.
+  StructType *getProfileDataTy();
 };
 
 ///
@@ -1190,19 +1199,22 @@ void InstrLowerer::lowerTimestamp(
 
 void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   auto *Addr = getCounterAddress(Inc);
-
   IRBuilder<> Builder(Inc);
   if (isGPUProfTarget(M)) {
-    auto *I64Ty = Builder.getInt64Ty();
+    auto *Int64Ty = Builder.getInt64Ty();
     auto *PtrTy = Builder.getPtrTy();
     auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()),
-                                       {PtrTy, PtrTy, I64Ty}, false);
-    auto Callee =
-        M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy);
+                                       {PtrTy, PtrTy, Int64Ty}, false);
+    FunctionCallee Callee =
+        M.getOrInsertFunction(RTLIB::RuntimeLibcallsInfo::getLibcallImplName(
+                                  RTLIB::impl___llvm_profile_instrument_gpu),
+                              CalleeTy);
     Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy);
     Value *Uniform =
         ConstantPointerNull::get(PointerType::getUnqual(M.getContext()));
-    Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()});
+    Value *StepI64 =
+        Builder.CreateZExtOrTrunc(Inc->getStep(), Int64Ty, "step.i64");
+    Builder.CreateCall(Callee, {CastAddr, Uniform, StepI64});
   } else if (Options.Atomic || AtomicCounterUpdateAll ||
              (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
     Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
@@ -1400,6 +1412,12 @@ stat...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/201607
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to