https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/137655
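For context: `AMDGPUPromoteKernelArguments` recursively promoted a kernel's generic (flat) pointer arguments, and the pointers loaded through them, into the global address space. As the deleted source below shows, it tagged each load it could prove unclobbered with `!amdgpu.noclobber` metadata and inserted a flat-to-global-to-flat `addrspacecast` round-trip, leaving the actual rewriting to a later run of `infer-address-spaces`. A minimal before/after sketch of that behavior, simplified from the deleted tests rather than copied from them (the kernel names and the stored constant are illustrative):

    ; Input: a kernel that loads a flat pointer out of a global argument.
    define amdgpu_kernel void @example(ptr addrspace(1) %Arg) {
    entry:
      %p = load ptr, ptr addrspace(1) %Arg, align 8
      store float 0.000000e+00, ptr %p, align 4
      ret void
    }

    ; After the old amdgpu-promote-kernel-arguments + infer-address-spaces
    ; pipeline: the load is tagged and its users are rewritten to addrspace(1).
    define amdgpu_kernel void @example_promoted(ptr addrspace(1) %Arg) {
    entry:
      %p = load ptr, ptr addrspace(1) %Arg, align 8, !amdgpu.noclobber !0
      %p.global = addrspacecast ptr %p to ptr addrspace(1)
      store float 0.000000e+00, ptr addrspace(1) %p.global, align 4
      ret void
    }

    !0 = !{}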
From 531195729a62694205763accce085b46d9a5bc10 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i...@tianshilei.me>
Date: Sun, 27 Apr 2025 13:38:11 -0400
Subject: [PATCH] [AMDGPU] Remove the pass `AMDGPUPromoteKernelArguments`

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   9 -
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   2 -
 .../AMDGPU/AMDGPUPromoteKernelArguments.cpp   | 219 ------------------
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  13 --
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 -
 .../AMDGPU/promote-kernel-arguments.ll        | 120 +++++-----
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |   1 -
 7 files changed, 52 insertions(+), 313 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..edbded03957dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -113,15 +113,6 @@ FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
 
-FunctionPass *createAMDGPUPromoteKernelArgumentsPass();
-void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &);
-extern char &AMDGPUPromoteKernelArgumentsID;
-
-struct AMDGPUPromoteKernelArgumentsPass
-    : PassInfoMixin<AMDGPUPromoteKernelArgumentsPass> {
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-};
-
 ModulePass *createAMDGPULowerKernelAttributesPass();
 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
 extern char &AMDGPULowerKernelAttributesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 98a1147ef6d66..30cf06d3b3dd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -59,8 +59,6 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
-FUNCTION_PASS("amdgpu-promote-kernel-arguments",
-              AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
 FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
 FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
deleted file mode 100644
index 06819d05b4be6..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This pass recursively promotes generic pointer arguments of a kernel
-/// into the global address space.
-///
-/// The pass walks kernel's pointer arguments, then loads from them. If a loaded
-/// value is a pointer and loaded pointer is unmodified in the kernel before the
-/// load, then promote loaded pointer to global. Then recursively continue.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUMemoryUtils.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-
-#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"
-
-using namespace llvm;
-
-namespace {
-
-class AMDGPUPromoteKernelArguments : public FunctionPass {
-  MemorySSA *MSSA;
-
-  AliasAnalysis *AA;
-
-  Instruction *ArgCastInsertPt;
-
-  SmallVector<Value *> Ptrs;
-
-  void enqueueUsers(Value *Ptr);
-
-  bool promotePointer(Value *Ptr);
-
-  bool promoteLoad(LoadInst *LI);
-
-public:
-  static char ID;
-
-  AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
-
-  bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA);
-
-  bool runOnFunction(Function &F) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AAResultsWrapperPass>();
-    AU.addRequired<MemorySSAWrapperPass>();
-    AU.setPreservesAll();
-  }
-};
-
-} // end anonymous namespace
-
-void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
-  SmallVector<User *> PtrUsers(Ptr->users());
-
-  while (!PtrUsers.empty()) {
-    Instruction *U = dyn_cast<Instruction>(PtrUsers.pop_back_val());
-    if (!U)
-      continue;
-
-    switch (U->getOpcode()) {
-    default:
-      break;
-    case Instruction::Load: {
-      LoadInst *LD = cast<LoadInst>(U);
-      if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr &&
-          !AMDGPU::isClobberedInFunction(LD, MSSA, AA))
-        Ptrs.push_back(LD);
-
-      break;
-    }
-    case Instruction::GetElementPtr:
-    case Instruction::AddrSpaceCast:
-    case Instruction::BitCast:
-      if (U->getOperand(0)->stripInBoundsOffsets() == Ptr)
-        PtrUsers.append(U->user_begin(), U->user_end());
-      break;
-    }
-  }
-}
-
-bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
-  bool Changed = false;
-
-  LoadInst *LI = dyn_cast<LoadInst>(Ptr);
-  if (LI)
-    Changed |= promoteLoad(LI);
-
-  PointerType *PT = dyn_cast<PointerType>(Ptr->getType());
-  if (!PT)
-    return Changed;
-
-  if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
-      PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
-      PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
-    enqueueUsers(Ptr);
-
-  if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return Changed;
-
-  IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
-                   : ArgCastInsertPt);
-
-  // Cast pointer to global address space and back to flat and let
-  // Infer Address Spaces pass to do all necessary rewriting.
-  PointerType *NewPT =
-      PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS);
-  Value *Cast =
-      B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
-  Value *CastBack =
-      B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
-  Ptr->replaceUsesWithIf(CastBack,
-                         [Cast](Use &U) { return U.getUser() != Cast; });
-
-  return true;
-}
-
-bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
-  if (!LI->isSimple())
-    return false;
-
-  LI->setMetadata("amdgpu.noclobber", MDNode::get(LI->getContext(), {}));
-  return true;
-}
-
-// skip allocas
-static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
-  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
-  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
-    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
-
-    // If this is a dynamic alloca, the value may depend on the loaded kernargs,
-    // so loads will need to be inserted before it.
-    if (!AI || !AI->isStaticAlloca())
-      break;
-  }
-
-  return InsPt;
-}
-
-bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
-                                       AliasAnalysis &AA) {
-  if (skipFunction(F))
-    return false;
-
-  CallingConv::ID CC = F.getCallingConv();
-  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
-    return false;
-
-  ArgCastInsertPt = &*getInsertPt(*F.begin());
-  this->MSSA = &MSSA;
-  this->AA = &AA;
-
-  for (Argument &Arg : F.args()) {
-    if (Arg.use_empty())
-      continue;
-
-    PointerType *PT = dyn_cast<PointerType>(Arg.getType());
-    if (!PT || (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
-                PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
-                PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS))
-      continue;
-
-    Ptrs.push_back(&Arg);
-  }
-
-  bool Changed = false;
-  while (!Ptrs.empty()) {
-    Value *Ptr = Ptrs.pop_back_val();
-    Changed |= promotePointer(Ptr);
-  }
-
-  return Changed;
-}
-
-bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
-  MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
-  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
-  return run(F, MSSA, AA);
-}
-
-INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
-                      "AMDGPU Promote Kernel Arguments", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
-                    "AMDGPU Promote Kernel Arguments", false, false)
-
-char AMDGPUPromoteKernelArguments::ID = 0;
-
-FunctionPass *llvm::createAMDGPUPromoteKernelArgumentsPass() {
-  return new AMDGPUPromoteKernelArguments();
-}
-
-PreservedAnalyses
-AMDGPUPromoteKernelArgumentsPass::run(Function &F,
-                                      FunctionAnalysisManager &AM) {
-  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
-  AliasAnalysis &AA = AM.getResult<AAManager>(F);
-  if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) {
-    PreservedAnalyses PA;
-    PA.preserveSet<CFGAnalyses>();
-    PA.preserve<MemorySSAAnalysis>();
-    return PA;
-  }
-  return PreservedAnalyses::all();
-}
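One detail worth keeping in mind when reading the test updates further down: the deleted pass only tagged a load with `!amdgpu.noclobber` when `AMDGPU::isClobberedInFunction` (backed by MemorySSA and alias analysis) proved no store could clobber it first, while the address-space promotion of the pointer itself could still go ahead. A sketch of the kind of input that exercised that guard, loosely modeled on the `global_ptr_arg_clobbered_after_load` test below (the function name and exact shape here are illustrative, not taken from the test file):

    define amdgpu_kernel void @clobbered(ptr addrspace(1) %Arg) {
    entry:
      ; Nothing can clobber %Arg before this load, so the old pass tagged it.
      %p = load ptr, ptr addrspace(1) %Arg, align 8
      ; This store may alias the location %p was loaded from...
      store ptr null, ptr addrspace(1) %Arg, align 8
      ; ...so the load below was left untagged, even though %p itself was
      ; still promoted via the addrspacecast round-trip.
      %v = load float, ptr %p, align 4
      store float %v, ptr %p, align 4
      ret void
    }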
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 150060e1b266c..53f41812e523d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -435,11 +430,6 @@ static cl::opt<bool> EnablePreRAOptimizations(
     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool> EnablePromoteKernelArguments(
-    "amdgpu-enable-promote-kernel-arguments",
-    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
-    cl::Hidden, cl::init(true));
-
 static cl::opt<bool> EnableImageIntrinsicOptimizer(
     "amdgpu-enable-image-intrinsic-optimizer",
     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
@@ -520,7 +515,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
-  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
@@ -854,13 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         FunctionPassManager FPM;
 
-        // Add promote kernel arguments pass to the opt pipeline right before
-        // infer address spaces which is needed to do actual address space
-        // rewriting.
-        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
-            EnablePromoteKernelArguments)
-          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
-
         // Add infer address spaces pass to the opt pipeline after inlining
         // but before SROA to increase SROA opportunities.
         FPM.addPass(InferAddressSpacesPass());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..d1c67b408a95f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -91,7 +91,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPreloadKernArgProlog.cpp
   AMDGPUPrintfRuntimeBinding.cpp
   AMDGPUPromoteAlloca.cpp
-  AMDGPUPromoteKernelArguments.cpp
   AMDGPURegBankCombiner.cpp
   AMDGPURegBankLegalize.cpp
   AMDGPURegBankLegalizeHelper.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
index 0696cbe5aa891..aa7f820189507 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=infer-address-spaces | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: ptr_nest_3:
 ; GCN-COUNT-2: global_load_dwordx2
@@ -11,11 +10,9 @@ define amdgpu_kernel void @ptr_nest_3(ptr addrspace(1) nocapture readonly %Arg)
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
-; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0:![0-9]+]]
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
-; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr [[P2]], align 8
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -36,9 +33,8 @@ define amdgpu_kernel void @ptr_bitcast(ptr nocapture readonly %Arg) {
 ; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I]]
-; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
-; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -57,11 +53,10 @@ entry:
 define amdgpu_kernel void @ptr_in_struct(ptr addrspace(1) nocapture readonly %Arg) {
 ; CHECK-LABEL: @ptr_in_struct(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = load ptr, ptr addrspace(1) [[ARG:%.*]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P1_GLOBAL:%.*]] = addrspacecast ptr [[P1]] to ptr addrspace(1)
+; CHECK-NEXT:    [[P1:%.*]] = load ptr, ptr addrspace(1) [[ARG:%.*]], align 8
 ; CHECK-NEXT:    [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[P1_GLOBAL]], i32 [[ID]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i32 [[ID]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -85,36 +80,34 @@ define amdgpu_kernel void @flat_ptr_arg(ptr nocapture readonly noalias %Arg, ptr
 ; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
-; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[OUT_GLOBAL]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[I1]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
 ; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
-; CHECK-NEXT:    [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 1
+; CHECK-NEXT:    [[I3:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
 ; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
 ; CHECK-NEXT:    store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
-; CHECK-NEXT:    [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 2
+; CHECK-NEXT:    [[I4:%.*]] = load float, ptr [[ARRAYIDX3_2]], align 4
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
 ; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
 ; CHECK-NEXT:    store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
-; CHECK-NEXT:    [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 3
+; CHECK-NEXT:    [[I5:%.*]] = load float, ptr [[ARRAYIDX3_3]], align 4
 ; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
 ; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
 ; CHECK-NEXT:    store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
 ; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
 ; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[OUT_GLOBAL]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I7:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX11]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[I7_GLOBAL:%.*]] = addrspacecast ptr [[I7]] to ptr addrspace(1)
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[I7:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX11]], align 8
 ; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I7_GLOBAL]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[I7]], i64 [[IDXPROM8]]
+; CHECK-NEXT:    store float [[I6]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -161,23 +154,22 @@ define amdgpu_kernel void @global_ptr_arg(ptr addrspace(1) nocapture readonly %A
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
-; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[I1]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
 ; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
-; CHECK-NEXT:    [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 1
+; CHECK-NEXT:    [[I3:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
 ; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
 ; CHECK-NEXT:    store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
-; CHECK-NEXT:    [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 2
+; CHECK-NEXT:    [[I4:%.*]] = load float, ptr [[ARRAYIDX3_2]], align 4
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
 ; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
 ; CHECK-NEXT:    store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
-; CHECK-NEXT:    [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 3
+; CHECK-NEXT:    [[I5:%.*]] = load float, ptr [[ARRAYIDX3_3]], align 4
 ; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
 ; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
 ; CHECK-NEXT:    store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
@@ -185,8 +177,8 @@ define amdgpu_kernel void @global_ptr_arg(ptr addrspace(1) nocapture readonly %A
 ; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
 ; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
 ; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 [[IDXPROM8]]
+; CHECK-NEXT:    store float [[I6]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -276,19 +268,18 @@ define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(ptr addrspace(1)
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
+; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARRAYIDX10]], i32 [[X:%.*]]
 ; CHECK-NEXT:    store ptr null, ptr addrspace(1) [[ARRAYIDX11]], align 4
-; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[I1]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X]]
 ; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
 ; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
 ; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
 ; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 [[IDXPROM8]]
+; CHECK-NEXT:    store float [[I6]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -319,11 +310,9 @@ define amdgpu_kernel void @ptr_nest_3_barrier(ptr addrspace(1) nocapture readonl
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
-; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
-; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr [[P2]], align 8
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -345,9 +334,8 @@ define amdgpu_kernel void @flat_ptr_nest_2(ptr nocapture readonly %Arg, i32 %i)
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -366,8 +354,8 @@ define amdgpu_kernel void @const_ptr_nest_3(ptr addrspace(4) nocapture readonly
 ; CHECK-LABEL: @const_ptr_nest_3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
 ; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
@@ -389,10 +377,9 @@ define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(ptr addrspace(4) noc
 ; CHECK-LABEL: @cast_from_const_const_ptr_nest_3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -416,8 +403,7 @@ define amdgpu_kernel void @flat_ptr_volatile_load(ptr nocapture readonly %Arg, i
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
 ; CHECK-NEXT:    [[P2:%.*]] = load volatile ptr, ptr [[TMP0]], align 8
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -437,8 +423,7 @@ define amdgpu_kernel void @flat_ptr_atomic_load(ptr nocapture readonly %Arg, i32
 ; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
 ; CHECK-NEXT:    [[P2:%.*]] = load atomic ptr, ptr addrspace(1) [[P1]] monotonic, align 8
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -457,10 +442,9 @@ define amdgpu_kernel void @cast_changing_pointee_type(ptr addrspace(1) nocapture
 ; CHECK-LABEL: @cast_changing_pointee_type(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2]], align 8, !amdgpu.noclobber [[META0]]
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[P1]], align 8
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2]], align 8
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index cdde7682582ac..950e39f31ac32 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -178,7 +178,6 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUPreloadKernArgProlog.cpp",
     "AMDGPUPrintfRuntimeBinding.cpp",
     "AMDGPUPromoteAlloca.cpp",
-    "AMDGPUPromoteKernelArguments.cpp",
     "AMDGPURegBankCombiner.cpp",
    "AMDGPURegBankLegalize.cpp",
     "AMDGPURegBankLegalizeHelper.cpp",
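With the pass gone, promote-kernel-arguments.ll now exercises `infer-address-spaces` on its own, which is why the regenerated checks above drop both the `!amdgpu.noclobber` annotations and the `*_GLOBAL` cast round-trips. Applied to the same sketch used at the top of this patch (again illustrative, not taken from the test file), the new pipeline simply leaves the loaded pointer in the flat address space:

    ; infer-address-spaces alone: no metadata, no promotion of %p.
    define amdgpu_kernel void @example(ptr addrspace(1) %Arg) {
    entry:
      %p = load ptr, ptr addrspace(1) %Arg, align 8
      store float 0.000000e+00, ptr %p, align 4
      ret void
    }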