llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Yaxun (Sam) Liu (yxsamliu)
<details>
<summary>Changes</summary>
This relands the compiler portion of #<!-- -->177665 ("[PGO][AMDGPU] Add basic
HIP offload PGO support"), which was reverted in #<!-- -->201416, and adds the
driver wiring needed for the new separate device-profile runtime library.
The first commit restores the reverted compiler state verbatim: the AMDGPU
instrumentation lowering in LLVM (InstrProfiling.cpp and
PGOInstrumentation.cpp), the HIP device-variable and shadow registration in
Clang codegen (CGCUDANV.cpp), the supporting InstrProf.h and RuntimeLibcalls.td
changes, and their tests. This part depends on the runtime ABI provided by the
compiler-rt change, which is relanded separately.
The second commit adds the driver wiring to link the ROCm device-profile
collection runtime. When building HIP with device PGO (-fprofile-generate or
-fprofile-instr-generate) the host program must drain device-side profile
counters. That collection runtime now lives in the separate
clang_rt.profile_rocm library, so the driver must link it into the host image
in addition to the base clang_rt.profile. This is done in addOffloadRTLibs,
which already knows the active offload kinds and links libamdhip64, for both
the Linux/GNU and MSVC host toolchains. It is gated on the HIP offload kind,
needsProfileRT, and the library being present in the resource directory.
Non-HIP profile links and HIP links without profiling are unaffected.
This change depends on the compiler-rt change that adds clang_rt.profile_rocm.
---
Patch is 33.50 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/201607.diff
13 Files Affected:
- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+152)
- (modified) clang/lib/Driver/ToolChains/Linux.cpp (+7)
- (modified) clang/lib/Driver/ToolChains/MSVC.cpp (+8)
- (added) clang/test/CodeGenHIP/offload-pgo-sections.hip (+50)
- (added) clang/test/Driver/hip-profile-rocm-runtime.hip (+32)
- (modified) llvm/include/llvm/IR/RuntimeLibcalls.td (+7-2)
- (modified) llvm/include/llvm/ProfileData/InstrProf.h (+5)
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+83-26)
- (modified) llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp (+2-1)
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll
(+32)
- (added) llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll (+26)
- (added) llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll (+36)
- (added) llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
(+22)
``````````diff
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 259b6c040706b..65f398af7902b 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace clang;
using namespace CodeGen;
@@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
/// ModuleDtorFunction()
llvm::GlobalVariable *GpuBinaryHandle = nullptr;
+ /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global,
+ /// emitted only for HIP host compiles when PGO is on. Registered via
+ /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
+ /// can locate the device-side table by name.
+ llvm::GlobalVariable *OffloadProfShadow = nullptr;
/// Whether we generate relocatable device code.
bool RelocatableDeviceCode;
/// Mangle context for device.
@@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime {
void transformManagedVars();
/// Create offloading entries to register globals in RDC mode.
void createOffloadingEntries();
+ /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global.
+ /// On the device side it is the populated 7-pointer section-bounds table.
+ /// On the host side it is a placeholder void* shadow stored in
+ /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC)
+ /// or createOffloadingEntries (RDC) so the runtime can locate the
+ /// device-side table by name.
+ void emitOffloadProfilingSections();
public:
CGNVCUDARuntime(CodeGenModule &CGM);
@@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
}
}
+ // Register the per-TU offload-profiling shadow so the host runtime can
+ // locate the matching device-side __llvm_profile_sections_<CUID>. We
+ // emit both __hipRegisterVar (so the HIP runtime can map the host
+ // shadow to the device symbol) and
+ // __llvm_profile_offload_register_shadow_variable (so the profile
+ // runtime adds the shadow to its drain list).
+ if (OffloadProfShadow) {
+ llvm::Constant *Name =
+ makeConstantString(std::string(OffloadProfShadow->getName()));
+ llvm::Value *RegisterVarArgs[] = {
+ &GpuBinaryHandlePtr,
+ OffloadProfShadow,
+ Name,
+ Name,
+ llvm::ConstantInt::get(IntTy, /*Extern=*/0),
+ llvm::ConstantInt::get(VarSizeTy,
CGM.getDataLayout().getPointerSize()),
+ llvm::ConstantInt::get(IntTy, /*Constant=*/0),
+ llvm::ConstantInt::get(IntTy, 0)};
+ Builder.CreateCall(RegisterVar, RegisterVarArgs);
+
+ llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+ "__llvm_profile_offload_register_shadow_variable");
+ Builder.CreateCall(RegisterShadow, {OffloadProfShadow});
+ }
+
Builder.CreateRetVoid();
return RegisterKernelsFunc;
}
@@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() {
I.Flags.getSurfTexType());
}
}
+
+ // Register the per-TU offload-profiling shadow. The offloading entry
+ // makes the linker-wrapper emit the host __hipRegisterVar call in the
+ // combined ctor. Separately emit a per-TU ctor that registers the
+ // shadow with the profile runtime's drain list.
+ if (OffloadProfShadow) {
+ llvm::offloading::emitOffloadingEntry(
+ M, Kind, OffloadProfShadow, OffloadProfShadow->getName(),
+ CGM.getDataLayout().getPointerSize(),
+ llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+
+ llvm::LLVMContext &Ctx = M.getContext();
+ auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+ llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(VoidTy, {PtrTy}, false),
+ "__llvm_profile_offload_register_shadow_variable");
+ auto *CtorFn = llvm::Function::Create(
+ llvm::FunctionType::get(VoidTy, false),
+ llvm::GlobalValue::InternalLinkage,
+ "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(),
&M);
+ auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn);
+ llvm::IRBuilder<> B(Entry);
+ B.CreateCall(RegisterShadow, {OffloadProfShadow});
+ B.CreateRetVoid();
+ llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535);
+ }
+}
+
+// For HIP host+device compiles with PGO enabled, emit the per-TU global
+// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding
+// section start/stop bounds for the names/counters/data sections plus the
+// raw-version variable. Host side: an opaque void* shadow whose only
+// purpose is to give the host-runtime a registered symbol name to look up
+// via hipGetSymbolAddress; the actual device-side data lives in the
+// matching device-side global.
+void CGNVCUDARuntime::emitOffloadProfilingSections() {
+ if (!CGM.getLangOpts().HIP)
+ return;
+ if (!CGM.getCodeGenOpts().hasProfileInstr())
+ return;
+
+ StringRef CUIDHash = CGM.getContext().getCUIDHash();
+ if (CUIDHash.empty())
+ return;
+
+ llvm::Module &M = CGM.getModule();
+ llvm::LLVMContext &Ctx = M.getContext();
+ std::string Name = ("__llvm_profile_sections_" + CUIDHash).str();
+
+ // If the global already exists (e.g. another TU was merged in), don't
+ // duplicate it.
+ if (M.getNamedValue(Name))
+ return;
+
+ if (CGM.getLangOpts().CUDAIsDevice) {
+ // Device side: emit the populated struct. Section start/stop symbols
+ // are linker-defined (ELF auto-generates __start_/__stop_ for any
+ // section whose name is a valid C identifier; AMDGPU is ELF).
+ unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+ auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS);
+ auto getOrDeclare = [&](StringRef SymName) {
+ if (auto *GV = M.getNamedGlobal(SymName))
+ return GV;
+ auto *GV = new llvm::GlobalVariable(
+ M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
+ llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName,
+ /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+ GlobalAS);
+ GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+ return GV;
+ };
+ auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version");
+ if (!VersionGV) {
+ VersionGV = new llvm::GlobalVariable(
+ M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true,
+ llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
+ "__llvm_profile_raw_version",
+ /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+ GlobalAS);
+ }
+
+ auto *StructTy = llvm::StructType::get(
+ Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
+ llvm::Constant *Fields[] = {
+ getOrDeclare("__start___llvm_prf_names"),
+ getOrDeclare("__stop___llvm_prf_names"),
+ getOrDeclare("__start___llvm_prf_cnts"),
+ getOrDeclare("__stop___llvm_prf_cnts"),
+ getOrDeclare("__start___llvm_prf_data"),
+ getOrDeclare("__stop___llvm_prf_data"),
+ VersionGV,
+ };
+ auto *Init = llvm::ConstantStruct::get(StructTy, Fields);
+ auto *GV = new llvm::GlobalVariable(
+ M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
+ Init, Name, /*InsertBefore=*/nullptr,
llvm::GlobalValue::NotThreadLocal,
+ GlobalAS);
+ GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+ CGM.addCompilerUsedGlobal(GV);
+ return;
+ }
+
+ // Host side: emit an opaque void* shadow. Layout doesn't matter — the
+ // runtime locates it by name via hipGetSymbolAddress and treats it as
+ // the address of the device-side struct. Registration with the HIP
+ // runtime is added by makeRegisterGlobalsFn (non-RDC) or
+ // createOffloadingEntries (RDC).
+ auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
+ OffloadProfShadow = new llvm::GlobalVariable(
+ M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
+ llvm::ConstantPointerNull::get(PtrTy), Name);
+ CGM.addCompilerUsedGlobal(OffloadProfShadow);
}
// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
transformManagedVars();
+ emitOffloadProfilingSections();
if (CGM.getLangOpts().CUDAIsDevice) {
// Mark ODR-used device variables as compiler used to prevent it from being
// eliminated by optimization. This is necessary for device variables
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp
b/clang/lib/Driver/ToolChains/Linux.cpp
index 5f04afe34c554..227e5e2811dac 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -902,6 +902,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const
ArgList &Args,
if (ActiveKinds & Action::OFK_HIP)
CmdArgs.push_back(
Args.MakeArgString(StringRef("-L") + RocmInstallation->getLibPath()));
+
+ // For HIP with device PGO, link the host-side ROCm device-profile collection
+ // runtime (clang_rt.profile_rocm) when it is available. It registers a
+ // collector with the base profile runtime to drain device counters.
+ if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) &&
+ getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+ CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
}
void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp
b/clang/lib/Driver/ToolChains/MSVC.cpp
index 6bc58699fb007..7829831f7d103 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -592,6 +592,14 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds,
const ArgList &Args,
CmdArgs.append({Args.MakeArgString(StringRef("-libpath:") +
RocmInstallation->getLibPath()),
"amdhip64.lib"});
+
+ // For HIP with device PGO, link the host-side ROCm device-profile
+ // collection runtime (clang_rt.profile_rocm) when it is available. It
+ // registers a collector with the base profile runtime to drain device
+ // counters.
+ if (needsProfileRT(Args) &&
+ getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static)))
+ CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm"));
}
}
diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip
b/clang/test/CodeGenHIP/offload-pgo-sections.hip
new file mode 100644
index 0000000000000..17c6fe7b9e609
--- /dev/null
+++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip
@@ -0,0 +1,50 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// Verify CGCUDANV emits the per-TU __llvm_profile_sections_<CUID> global
+// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct
+// in addrspace(1). Host compile: void* shadow registered with the HIP
+// runtime and with the profile runtime's drain list.
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefix=DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \
+// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefix=HOST %s
+
+// Guard: no PGO -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN: -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefix=NONE %s
+
+// Guard: no CUID -> no emission.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefix=NONE %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ int helper(int x) { return x + 1; }
+__global__ void kernel(int *p) { *p = helper(*p); }
+
+// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8
+// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8
+// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64
+// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1)
constant
{{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version
+// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+
+// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null
+// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]]
+// HOST: define internal void @__hip_register_globals
+// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]],
+// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr
@__llvm_profile_sections_[[CUID]])
+
+// NONE-NOT: __llvm_profile_sections_
+// NONE-NOT: __llvm_profile_offload_register_shadow_variable
diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip
b/clang/test/Driver/hip-profile-rocm-runtime.hip
new file mode 100644
index 0000000000000..0fe3ecd5ab660
--- /dev/null
+++ b/clang/test/Driver/hip-profile-rocm-runtime.hip
@@ -0,0 +1,32 @@
+// REQUIRES: x86-registered-target, amdgpu-registered-target
+// UNSUPPORTED: system-windows
+
+// Build a fake resource dir containing both the base profile runtime and the
+// ROCm device-profile runtime so the driver's existence check passes.
+// RUN: rm -rf %t && mkdir -p %t/lib/x86_64-unknown-linux
+// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile.a
+// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile_rocm.a
+// RUN: touch %t.o
+
+// HIP host link with PGO links clang_rt.profile_rocm.
+// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \
+// RUN: -fprofile-instr-generate -resource-dir=%t \
+// RUN: --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
+// RUN: | FileCheck -check-prefix=HIP-PGO %s
+// The ROCm device-profile runtime is linked before the base profile runtime so
+// its reference to the collector hook resolves against the base archive.
+// HIP-PGO-DAG: "{{.*}}libclang_rt.profile_rocm.a"
+// HIP-PGO-DAG: "{{.*}}libclang_rt.profile.a"
+
+// Without profiling, the ROCm device-profile runtime is not linked.
+// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \
+// RUN: -resource-dir=%t --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
+// RUN: | FileCheck -check-prefix=HIP-NOPGO %s
+// HIP-NOPGO-NOT: libclang_rt.profile_rocm.a
+
+// A non-HIP host link with PGO does not link the ROCm device-profile runtime.
+// RUN: %clang -### --target=x86_64-unknown-linux \
+// RUN: -fprofile-instr-generate -resource-dir=%t %t.o 2>&1 \
+// RUN: | FileCheck -check-prefix=HOST-PGO %s
+// HOST-PGO: "{{.*}}libclang_rt.profile.a"
+// HOST-PGO-NOT: libclang_rt.profile_rocm.a
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index cbdc48a9a717f..37bad559f49e7 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -500,6 +500,9 @@ def DEOPTIMIZE : RuntimeLibcall;
// Return address
def RETURN_ADDRESS : RuntimeLibcall;
+// GPU profiling
+def PROFILE_INSTRUMENT_GPU : RuntimeLibcall;
+
// Clear cache
def CLEAR_CACHE : RuntimeLibcall;
def RISCV_FLUSH_ICACHE : RuntimeLibcall;
@@ -2241,8 +2244,10 @@ def WindowsARM64ECSystemLibrary
def isAMDGPU : RuntimeLibcallPredicate<"TT.isAMDGPU()">;
-// No calls.
-def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add)>;
+def __llvm_profile_instrument_gpu : RuntimeLibcallImpl<PROFILE_INSTRUMENT_GPU>;
+
+def AMDGPUSystemLibrary
+ : SystemRuntimeLibrary<isAMDGPU, (add __llvm_profile_instrument_gpu)>;
//===----------------------------------------------------------------------===//
// ARM Runtime Libcalls
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h
b/llvm/include/llvm/ProfileData/InstrProf.h
index dffc58281c2d9..b7f917ec39b70 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -122,6 +122,11 @@ inline StringRef getInstrProfValueProfMemOpFuncName() {
/// Return the prefix of the name of the variables to function as a filter.
inline StringRef getInstrProfVarPrefix() { return "__prof"; }
+/// Return the name of the GPU wave-cooperative counter increment helper.
+inline StringRef getInstrProfInstrumentGPUFuncName() {
+ return INSTR_PROF_INSTRUMENT_GPU_FUNC_STR;
+}
+
/// Return the name prefix of variables containing instrumented function names.
inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; }
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index dabd495cddd49..8e4ba41919768 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -33,14 +34,17 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/ProfileData/InstrProf.h"
@@ -287,6 +291,8 @@ class InstrLowerer final {
GlobalVariable *NamesVar = nullptr;
size_t NamesSize = 0;
+ StructType *ProfileDataTy = nullptr;
+
// vector of counter load/store pairs to be register promoted.
std::vector<LoadStorePair> PromotionCandidates;
@@ -407,6 +413,9 @@ class InstrLowerer final {
/// Create a static initializer for our data, on platforms that need it,
/// and for any profile output file that was specified.
void emitInitialization();
+
+ /// Return the __llvm_profile_data struct type.
+ StructType *getProfileDataTy();
};
///
@@ -1190,19 +1199,22 @@ void InstrLowerer::lowerTimestamp(
void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
auto *Addr = getCounterAddress(Inc);
-
IRBuilder<> Builder(Inc);
if (isGPUProfTarget(M)) {
- auto *I64Ty = Builder.getInt64Ty();
+ auto *Int64Ty = Builder.getInt64Ty();
auto *PtrTy = Builder.getPtrTy();
auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()),
- {PtrTy, PtrTy, I64Ty}, false);
- auto Callee =
- M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy);
+ {PtrTy, PtrTy, Int64Ty}, false);
+ FunctionCallee Callee =
+ M.getOrInsertFunction(RTLIB::RuntimeLibcallsInfo::getLibcallImplName(
+ RTLIB::impl___llvm_profile_instrument_gpu),
+ CalleeTy);
Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy);
Value *Uniform =
ConstantPointerNull::get(PointerType::getUnqual(M.getContext()));
- Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()});
+ Value *StepI64 =
+ Builder.CreateZExtOrTrunc(Inc->getStep(), Int64Ty, "step.i64");
+ Builder.CreateCall(Callee, {CastAddr, Uniform, StepI64});
} else if (Options.Atomic || AtomicCounterUpdateAll ||
(Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
@@ -1400,6 +1412,12 @@ stat...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/201607
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits