llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Yaxun (Sam) Liu (yxsamliu)
<details>
<summary>Changes</summary>
The AMDGPUSimplifyLibCalls pass can fold separate sin(x) and cos(x) calls
into a single sincos(x) call, but this was not firing for HIP programs.
HIP math wrappers use static inline (for consistency with the CUDA clang
headers), producing internal linkage and _ZL-prefixed mangled names like
_ZL3sind that AMDGPULibFunc::parse does not recognize. Handle this in
fold() by stripping the L from _ZL and retrying the parse, but only for
sin and cos so the broader pass behaviour is unchanged.
The pass only checked CArgVal->users() to find partner sin/cos calls. When
sin and cos receive different SSA values from redundant loads of the same
address (pre-CSE), the partner was missed. Replace this with a function-wide
scan that matches equivalent arguments (same SSA value or loads from the
same pointer).
The pass looked for OpenCL-style mangled sincos which doesn't exist in HIP
modules. Add a fallback to look up __ocml_sincos_f{32,64}.
The device library is demand-linked so __ocml_sincos_f{32,64} was never
pulled in since user code only references sin and cos. Inject sincos
declarations with @<!-- -->llvm.compiler.used entries in emitTargetGlobals
before
device library linking so the demand-linker pulls in the definitions. A
late cleanup pass (AMDGPUUnusedLibFuncCleanupPass) removes unused sincos
after optimization to avoid dead-code overhead.
Fixes: LCOMPILER-19
---
Patch is 25.62 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/181774.diff
10 Files Affected:
- (modified) clang/lib/CodeGen/Targets/AMDGPU.cpp (+32)
- (added) clang/test/CodeGenHIP/sincos-demand-injection.hip (+38)
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp (+103-31)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+8)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos-ocml.ll (+94)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll (+48)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-unused-libfunc-cleanup-used.ll (+28)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-unused-libfunc-cleanup.ll (+32)
``````````diff
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4ac7f42289d6d..588a7d4ab2af2 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -10,6 +10,7 @@
#include "TargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace clang;
using namespace clang::CodeGen;
@@ -321,6 +322,7 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
bool shouldEmitStaticExternCAliases() const override;
bool shouldEmitDWARFBitFieldSeparators() const override;
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
+ void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
};
}
@@ -763,6 +765,36 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr(
assert(Max == 0 && "Max must be zero");
}
+// If the module references both __ocml_sin and __ocml_cos for a given type,
+// inject a declaration + @llvm.compiler.used entry for the corresponding
+// __ocml_sincos so the demand-linker pulls it in from the device library.
+// The @llvm.compiler.used entry prevents early GlobalDCE from removing sincos
+// before the AMDGPUSimplifyLibCallsPass can use it. A late cleanup pass
+// (AMDGPUUnusedLibFuncCleanupPass, registered at OptimizerLastEP) removes
+// unused sincos after optimization.
+void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
+ CodeGen::CodeGenModule &CGM) const {
+ llvm::Module &M = CGM.getModule();
+ llvm::SmallVector<llvm::GlobalValue *, 2> ToAdd;
+
+ for (bool IsF32 : {true, false}) {
+ auto *Sin = M.getFunction(IsF32 ? "__ocml_sin_f32" : "__ocml_sin_f64");
+ auto *Cos = M.getFunction(IsF32 ? "__ocml_cos_f32" : "__ocml_cos_f64");
+ const char *Name = IsF32 ? "__ocml_sincos_f32" : "__ocml_sincos_f64";
+ if (!Sin || !Cos || M.getFunction(Name))
+ continue;
+ llvm::Type *FPTy = IsF32 ? llvm::Type::getFloatTy(M.getContext())
+ : llvm::Type::getDoubleTy(M.getContext());
+ llvm::Type *PtrTy = llvm::PointerType::get(M.getContext(), 5);
+ ToAdd.push_back(llvm::Function::Create(
+ llvm::FunctionType::get(FPTy, {FPTy, PtrTy}, false),
+ llvm::GlobalValue::ExternalLinkage, Name, &M));
+ }
+
+ if (!ToAdd.empty())
+ llvm::appendToCompilerUsed(M, ToAdd);
+}
+
std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
diff --git a/clang/test/CodeGenHIP/sincos-demand-injection.hip
b/clang/test/CodeGenHIP/sincos-demand-injection.hip
new file mode 100644
index 0000000000000..1aacce366408e
--- /dev/null
+++ b/clang/test/CodeGenHIP/sincos-demand-injection.hip
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fcuda-is-device \
+// RUN: -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fcuda-is-device \
+// RUN: -emit-llvm -o - %s -DSIN_ONLY | FileCheck -check-prefix=NOSINCOS %s
+
+// Test that when a HIP device compilation sees calls to both __ocml_sin and
+// __ocml_cos for a given type, Clang injects an __ocml_sincos declaration
+// and adds it to @llvm.compiler.used so the demand-linker pulls it in from
+// the device library.
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+extern "C" __device__ float __ocml_sin_f32(float);
+extern "C" __device__ float __ocml_cos_f32(float);
+extern "C" __device__ double __ocml_sin_f64(double);
+extern "C" __device__ double __ocml_cos_f64(double);
+
+#ifdef SIN_ONLY
+// Only sin, no cos — sincos should NOT be injected.
+// NOSINCOS-NOT: __ocml_sincos
+__global__ void kernel_sin_only(float *fout, double *dout, float fx, double
dx) {
+ fout[0] = __ocml_sin_f32(fx);
+ dout[0] = __ocml_sin_f64(dx);
+}
+#else
+// Both sin and cos for f32 and f64 — sincos should be injected for both.
+// CHECK-DAG: @llvm.compiler.used =
{{.*}}@__ocml_sincos_f32{{.*}}@__ocml_sincos_f64
+// CHECK-DAG: declare float @__ocml_sincos_f32(float, ptr addrspace(5))
+// CHECK-DAG: declare double @__ocml_sincos_f64(double, ptr addrspace(5))
+__global__ void kernel_sin_cos_f32_f64(float *fout, double *dout, float fx,
double dx) {
+ fout[0] = __ocml_sin_f32(fx);
+ fout[1] = __ocml_cos_f32(fx);
+ dout[0] = __ocml_sin_f64(dx);
+ dout[1] = __ocml_cos_f64(dx);
+}
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..e72f3022f8db0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -86,6 +86,11 @@ struct AMDGPUUseNativeCallsPass :
PassInfoMixin<AMDGPUUseNativeCallsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+struct AMDGPUUnusedLibFuncCleanupPass
+ : PassInfoMixin<AMDGPUUnusedLibFuncCleanupPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
class SILowerI1CopiesPass : public PassInfoMixin<SILowerI1CopiesPass> {
public:
SILowerI1CopiesPass() = default;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 4de9349fe5166..7e490bda2042a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <cmath>
#define DEBUG_TYPE "amdgpu-simplifylib"
@@ -568,8 +569,21 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
return false;
FuncInfo FInfo;
- if (!parseFunctionName(Callee->getName(), FInfo))
- return false;
+ if (!parseFunctionName(Callee->getName(), FInfo)) {
+ // HIP math wrappers use static inline (for consistency with the CUDA clang
+ // headers), producing _ZL-prefixed names like _ZL3sind. Strip the 'L' and
+ // retry, but only proceed for sin/cos so the broader pass behaviour is
+ // unchanged.
+ StringRef Name = Callee->getName();
+ if (!Name.starts_with("_ZL"))
+ return false;
+ std::string Stripped = ("_Z" + Name.drop_front(3)).str();
+ if (!parseFunctionName(Stripped, FInfo))
+ return false;
+ if (FInfo.getId() != AMDGPULibFunc::EI_SIN &&
+ FInfo.getId() != AMDGPULibFunc::EI_COS)
+ return false;
+ }
// Further check the number of arguments to see if they match.
// TODO: Check calling convention matches too
@@ -1313,6 +1327,18 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp,
IRBuilder<> &B,
FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
+
+ // For HIP, the OpenCL-style mangled sincos may not exist. Fall back to
+ // __ocml_sincos_f{32,64} which has the same calling convention: returns sin
+ // value and stores cos through a private pointer.
+ if (!FSinCos) {
+ StringRef OcmlName = getArgType(fInfo) == AMDGPULibFunc::F32
+ ? "__ocml_sincos_f32"
+ : "__ocml_sincos_f64";
+ if (Function *OcmlSinCos = M->getFunction(OcmlName))
+ FSinCos = FunctionCallee(OcmlSinCos->getFunctionType(), OcmlSinCos);
+ }
+
if (!FSinCos)
return false;
@@ -1321,10 +1347,17 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp,
IRBuilder<> &B,
SmallVector<CallInst *> SinCosCalls;
FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
fInfo);
- const std::string PairName = PartnerInfo.mangle();
+ std::string PairName = PartnerInfo.mangle();
+
+ // mangle() always produces _Z-prefixed names, but the HIP math wrappers
+ // are static inline and use _ZL (internal linkage) mangling. Adjust the
+ // partner name to match.
+ StringRef OrigName = CI->getCalledFunction()->getName();
+ if (OrigName.starts_with("_ZL"))
+ PairName.insert(2, "L");
- StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
- StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
+ StringRef SinName = isSin ? OrigName : StringRef(PairName);
+ StringRef CosName = isSin ? StringRef(PairName) : OrigName;
const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
@@ -1334,33 +1367,49 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp,
IRBuilder<> &B,
SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
- for (User* U : CArgVal->users()) {
- CallInst *XI = dyn_cast<CallInst>(U);
- if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
- continue;
-
- Function *UCallee = XI->getCalledFunction();
- if (!UCallee)
- continue;
-
- bool Handled = true;
+ // Scan all calls in the function for sin/cos/sincos with equivalent
+ // arguments. We cannot just iterate CArgVal->users() because the partner
+ // call may use a different load from the same address that hasn't been
CSE'd.
+ for (BasicBlock &BB : *F) {
+ for (Instruction &I : BB) {
+ CallInst *XI = dyn_cast<CallInst>(&I);
+ if (!XI || XI->isNoBuiltin())
+ continue;
+
+ Function *UCallee = XI->getCalledFunction();
+ if (!UCallee || XI->arg_size() < 1)
+ continue;
+
+ // Check for equivalent arguments: same SSA value, or both loads from
+ // the same pointer (which haven't been CSE'd yet).
+ Value *XIArg = XI->getArgOperand(0);
+ if (CArgVal != XIArg) {
+ auto *LA = dyn_cast<LoadInst>(CArgVal);
+ auto *LB = dyn_cast<LoadInst>(XIArg);
+ if (!LA || !LB || LA->getPointerOperand() != LB->getPointerOperand())
+ continue;
+ }
- if (UCallee->getName() == SinName)
- SinCalls.push_back(XI);
- else if (UCallee->getName() == CosName)
- CosCalls.push_back(XI);
- else if (UCallee->getName() == SinCosPrivateName ||
- UCallee->getName() == SinCosGenericName)
- SinCosCalls.push_back(XI);
- else
- Handled = false;
-
- if (Handled) {
- MergeDbgLocs.push_back(XI->getDebugLoc());
- auto *OtherOp = cast<FPMathOperator>(XI);
- FMF &= OtherOp->getFastMathFlags();
- FPMath = MDNode::getMostGenericFPMath(
- FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
+ bool Handled = true;
+ StringRef CalleeName = UCallee->getName();
+
+ if (CalleeName == SinName)
+ SinCalls.push_back(XI);
+ else if (CalleeName == CosName)
+ CosCalls.push_back(XI);
+ else if (CalleeName == SinCosPrivateName ||
+ CalleeName == SinCosGenericName)
+ SinCosCalls.push_back(XI);
+ else
+ Handled = false;
+
+ if (Handled) {
+ MergeDbgLocs.push_back(XI->getDebugLoc());
+ auto *OtherOp = cast<FPMathOperator>(XI);
+ FMF &= OtherOp->getFastMathFlags();
+ FPMath = MDNode::getMostGenericFPMath(
+ FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
+ }
}
}
@@ -1681,3 +1730,26 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function
&F,
}
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
+
+PreservedAnalyses AMDGPUUnusedLibFuncCleanupPass::run(Module &M,
+ ModuleAnalysisManager
&AM) {
+ // Remove device-library functions from @llvm.compiler.used and erase them
+ // if they have no callers. These may have been eagerly pulled in before
+ // device-library linking to enable later optimisation passes (e.g. sin/cos
+ // → sincos merging); after those passes have run we clean up any that went
+ // unused.
+ bool Changed = false;
+ for (StringRef Name : {"__ocml_sincos_f32", "__ocml_sincos_f64"}) {
+ Function *F = M.getFunction(Name);
+ if (!F)
+ continue;
+ if (any_of(F->uses(), [](const Use &U) { return
isa<CallBase>(U.getUser()); }))
+ continue;
+ removeFromUsedLists(M, [F](Constant *C) {
+ return C->stripPointerCasts() == F;
+ });
+ F->eraseFromParent();
+ Changed = true;
+ }
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index f464fbf31c754..1df3212a34c71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -37,6 +37,7 @@ MODULE_PASS("amdgpu-printf-runtime-binding",
AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions",
AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass())
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
+MODULE_PASS("amdgpu-unused-libfunc-cleanup", AMDGPUUnusedLibFuncCleanupPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 49c60c254f6f7..ae8a5ed2a8674 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -979,6 +979,14 @@ void
AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
+ PB.registerOptimizerLastEPCallback(
+ [](ModulePassManager &MPM, OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
+ if (Level == OptimizationLevel::O0)
+ return;
+ MPM.addPass(AMDGPUUnusedLibFuncCleanupPass());
+ });
+
PB.registerCGSCCOptimizerLateEPCallback(
[this](CGSCCPassManager &PM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos-ocml.ll
b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos-ocml.ll
new file mode 100644
index 0000000000000..ae01ad351e807
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos-ocml.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 2
+; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall=1 < %s |
FileCheck %s
+
+; Test the realistic HIP sin/cos → sincos optimisation path.
+; In a real HIP compilation:
+; 1. HIP math wrappers are static inline, producing _ZL-prefixed names
+; (e.g. _ZL3sind instead of _Z3sind).
+; 2. CodeGenAction injects __ocml_sincos_f{32,64} declarations and adds
+; them to @llvm.compiler.used so the demand-linker pulls them in.
+; 3. AMDGPUSimplifyLibCallsPass (at PeepholeEP) recognises the _ZL names
+; and merges sin/cos into __ocml_sincos_f{32,64}.
+; 4. AMDGPUUnusedLibFuncCleanupPass (at OptimizerLastEP) removes unused
+; sincos entries from @llvm.compiler.used.
+;
+; This test mirrors that flow: functions use _ZL mangling,
+; __ocml_sincos is kept alive by @llvm.compiler.used, and the pass
+; runs as part of the -O1 pipeline (non-prelink).
+
+; _ZL mangled sin/cos — as produced by HIP's static inline wrappers.
+declare float @_ZL3sinf(float) #0
+declare float @_ZL3cosf(float) #0
+declare double @_ZL3sind(double) #0
+declare double @_ZL3cosd(double) #0
+
+; Only OCML-style sincos is available — no _Z6sincos* declarations.
+declare float @__ocml_sincos_f32(float, ptr addrspace(5) writeonly) #1
+declare double @__ocml_sincos_f64(double, ptr addrspace(5) writeonly) #1
+
+; Keep sincos alive through the pipeline, as CodeGenAction does.
[email protected] = appending global [2 x ptr] [
+ ptr @__ocml_sincos_f32,
+ ptr @__ocml_sincos_f64
+], section "llvm.metadata"
+
+define void @sincos_f32_ocml(float %x, ptr addrspace(1) nocapture writeonly
%sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
+; CHECK-LABEL: define void @sincos_f32_ocml
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) writeonly captures(none)
initializes((0, 4)) [[SIN_OUT:%.*]], ptr addrspace(1) writeonly captures(none)
initializes((0, 4)) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP0:%.*]] = call contract float @__ocml_sincos_f32(float
[[X]], ptr addrspace(5) [[__SINCOS_]])
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]],
align 4
+; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %call_sin = tail call contract float @_ZL3sinf(float %x)
+ store float %call_sin, ptr addrspace(1) %sin_out, align 4
+ %call_cos = tail call contract float @_ZL3cosf(float %x)
+ store float %call_cos, ptr addrspace(1) %cos_out, align 4
+ ret void
+}
+
+define void @sincos_f64_ocml(double %x, ptr addrspace(1) nocapture writeonly
%sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
+; CHECK-LABEL: define void @sincos_f64_ocml
+; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) writeonly captures(none)
initializes((0, 8)) [[SIN_OUT:%.*]], ptr addrspace(1) writeonly captures(none)
initializes((0, 8)) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP0:%.*]] = call contract double @__ocml_sincos_f64(double
[[X]], ptr addrspace(5) [[__SINCOS_]])
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(5) [[__SINCOS_]],
align 8
+; CHECK-NEXT: store double [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT: store double [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %call_sin = tail call contract double @_ZL3sind(double %x)
+ store double %call_sin, ptr addrspace(1) %sin_out, align 8
+ %call_cos = tail call contract double @_ZL3cosd(double %x)
+ store double %call_cos, ptr addrspace(1) %cos_out, align 8
+ ret void
+}
+
+; Verify that sin/cos with different arguments are NOT merged,
+; even when __ocml_sincos is available.
+define void @sincos_f32_ocml_no_merge_different_args(float %x, float %y, ptr
addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly
%cos_out) {
+; CHECK-LABEL: define void @sincos_f32_ocml_no_merge_different_args
+; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], ptr addrspace(1) writeonly
captures(none) initializes((0, 4)) [[SIN_OUT:%.*]], ptr addrspace(1) writeonly
captures(none) initializes((0, 4)) [[COS_OUT:%.*]]) local_unnamed_addr
#[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL_SIN:%.*]] = tail call contract float @_ZL3sinf(float
[[X]])
+; CHECK-NEXT: store float [[CALL_SIN]], ptr addrspace(1) [[SIN_OUT]], align
4
+; CHECK-NEXT: [[CALL_COS:%.*]] = tail call contract float @_ZL3cosf(float
[[Y]])
+; CHECK-NEXT: store float [[CALL_COS]], ptr addrspace(1) [[COS_OUT]], align
4
+; CHECK-NEXT: ret void
+;
+entry:
+ %call_sin = tail call contract float @_ZL3sinf(float %x)
+ store float %call_sin, ptr addrspace(1) %sin_out, align 4
+ %call_cos = tail call contract float @_ZL3cosf(float %y)
+ store float %call_cos, ptr addrspace(1) %cos_out, align 4
+ ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn
memory(none) }
+attributes #1 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
index ca3a68ce161ed..fc549f98f75a3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
@@ -1266,6 +1266,54 @@ entry:
ret float %sin2
}
+; Test that sin and cos with different loads from the same pointer are merged.
+; Before CSE, sin and cos may receive different SSA values that are loads from
+; the same address. The pass should recognize these as equivalent arguments.
+define void @sincos_f32_equivalent_load_args(ptr addrspace(1) %x_ptr, ptr
addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly
%cos_out) {
+; CHECK-LABEL: define void @sincos_f32_equivalent_load_args
+; CHECK-SAME: (ptr addrspace(1) readonly captures(none) [[X_PTR:%.*]], ptr
addrspace(1) writeonly captures(none) initializes((0, 4)) [[SIN_OUT:%.*...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/181774
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits