llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu

Author: Yoonseo Choi (yoonseoch)

Changes

Move AMDGPUAttributor to an earlier stage: run it right after InternalizePass and before many IR-level optimizations, so that the AMDGPU attributes it propagates across the call graph are available to more optimization passes. In addition, the logic of AMDGPULowerKernelAttributes is merged into AMDGPUAttributor and that pass is removed.

---

Patch is 84.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/177432.diff

14 Files Affected:

- (modified) clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp (+26-18)
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (-9)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp (+361-1)
- (removed) llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp (-443)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+8-21)
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll (+5-5)
- (added) llvm/test/CodeGen/AMDGPU/early-attributor.ll (+504)
- (modified) llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll (+24-13)
- (modified) llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll (+1-1)
- (modified) llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn (-1)

``````````diff
diff --git a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
index 8f92d1fed1f9f..b6645409722aa 100644
--- a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
+++ b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
@@ -81,7 +81,7 @@ __global__ void k4(SingleElement) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local amdgpu_kernel void @_Z2k55ByRef(
-// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -117,7 +117,7 @@ __global__ void k7(unsigned*) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f0s(
-// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -129,7 +129,7 @@ __device__ void f0(short) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f1j(
-// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -141,7 +141,7 @@ __device__ void f1(unsigned) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f2d(
-// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -153,7 +153,7 @@ __device__ void f2(double) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f311Transparent(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -165,7 +165,7 @@ __device__ void f3(Transparent) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f413SingleElement(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -177,7 +177,7 @@ __device__ void f4(SingleElement) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f55ByRef(
-// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -189,7 +189,7 @@ __device__ void f5(ByRef) { }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f6Dv1_jDv2_jDv3_jDv4_j(
-// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret void
 //
@@ -201,7 +201,7 @@ __device__ void f6(V1, V2, V3, V4) { }
 // AMDGCNSPIRV-NEXT: ret i16 0
 //
 // AMDGPU-LABEL: define dso_local noundef signext i16 @_Z2f7v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret i16 0
 //
@@ -213,7 +213,7 @@ __device__ short f7() { return 0; }
 // AMDGCNSPIRV-NEXT: ret i32 0
 //
 // AMDGPU-LABEL: define dso_local noundef i32 @_Z2f8v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret i32 0
 //
@@ -225,7 +225,7 @@ __device__ unsigned f8() { return 0; }
 // AMDGCNSPIRV-NEXT: ret double 0.000000e+00
 //
 // AMDGPU-LABEL: define dso_local noundef double @_Z2f9v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret double 0.000000e+00
 //
@@ -237,7 +237,7 @@ __device__ double f9() { return 0.; }
 // AMDGCNSPIRV-NEXT: ret i32 0
 //
 // AMDGPU-LABEL: define dso_local noundef i32 @_Z3f10v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret i32 0
 //
@@ -249,7 +249,7 @@ __device__ Transparent f10() { return {}; }
 // AMDGCNSPIRV-NEXT: ret i32 0
 //
 // AMDGPU-LABEL: define dso_local noundef i32 @_Z3f11v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret i32 0
 //
@@ -262,7 +262,7 @@ __device__ SingleElement f11() { return {}; }
 // AMDGCNSPIRV-NEXT: ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z3f12v(
-// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 4 dereferenceable(68) [[AGG_RESULT]], i8 0, i64 68, i1 false)
 // AMDGPU-NEXT: ret void
@@ -275,7 +275,7 @@ __device__ ByRef f12() { return {}; }
 // AMDGCNSPIRV-NEXT: ret <1 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <1 x i32> @_Z3f13v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret <1 x i32> zeroinitializer
 //
@@ -287,7 +287,7 @@ __device__ V1 f13() { return {}; }
 // AMDGCNSPIRV-NEXT: ret <2 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <2 x i32> @_Z3f14v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret <2 x i32> zeroinitializer
 //
@@ -299,7 +299,7 @@ __device__ V2 f14() { return {}; }
 // AMDGCNSPIRV-NEXT: ret <3 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <3 x i32> @_Z3f15v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret <3 x i32> zeroinitializer
 //
@@ -311,7 +311,7 @@ __device__ V3 f15() { return {}; }
 // AMDGCNSPIRV-NEXT: ret <4 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <4 x i32> @_Z3f16v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT: [[ENTRY:.*:]]
 // AMDGPU-NEXT: ret <4 x i32> zeroinitializer
 //
@@ -319,3 +319,11 @@ __device__ V4 f16() { return {}; }
 //.
 // AMDGCNSPIRV: [[META9]] = !{i32 1024, i32 1, i32 1}
 //.
+
+// For recording purpose of AMDGPU
+// attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" }
+// attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" }
+// attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #3 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #4 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #5 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..de76dd6ab3bb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -123,15 +123,6 @@ struct AMDGPUPromoteKernelArgumentsPass
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
-ModulePass *createAMDGPULowerKernelAttributesPass();
-void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
-extern char &AMDGPULowerKernelAttributesID;
-
-struct AMDGPULowerKernelAttributesPass
-    : PassInfoMixin<AMDGPULowerKernelAttributesPass> {
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-};
-
 void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
 extern char &AMDGPULowerModuleLDSLegacyPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0b2ee6371da06..1f4229a2b15a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,8 +13,14 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO/Attributor.h"
 
@@ -50,6 +56,343 @@ static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
 #include "AMDGPUAttributes.def"
 };
 
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+  WORKGROUP_SIZE_X = 4,
+  WORKGROUP_SIZE_Y = 6,
+  WORKGROUP_SIZE_Z = 8,
+
+  GRID_SIZE_X = 12,
+  GRID_SIZE_Y = 16,
+  GRID_SIZE_Z = 20
+};
+
+// Field offsets to implicit kernel argument pointer.
+enum ImplicitArgOffsets {
+  HIDDEN_BLOCK_COUNT_X = 0,
+  HIDDEN_BLOCK_COUNT_Y = 4,
+  HIDDEN_BLOCK_COUNT_Z = 8,
+
+  HIDDEN_GROUP_SIZE_X = 12,
+  HIDDEN_GROUP_SIZE_Y = 14,
+  HIDDEN_GROUP_SIZE_Z = 16,
+
+  HIDDEN_REMAINDER_X = 18,
+  HIDDEN_REMAINDER_Y = 20,
+  HIDDEN_REMAINDER_Z = 22,
+};
+
+static Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
+  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
+                                 : Intrinsic::amdgcn_dispatch_ptr;
+  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
+}
+
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+                                            uint32_t MaxNumGroups) {
+  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+    return;
+
+  if (!Load->getType()->isIntegerTy(32))
+    return;
+
+  // TODO: If there is existing range metadata, preserve it if it is stricter.
+  MDBuilder MDB(Load->getContext());
+  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+  Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
+static bool processUse(CallInst *CI, bool IsV5OrAbove) {
+  Function *F = CI->getFunction();
+
+  auto *MD = F->getMetadata("reqd_work_group_size");
+  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+  const bool HasUniformWorkGroupSize =
+      F->getFnAttribute("uniform-work-group-size").getValueAsBool();
+
+  SmallVector<unsigned> MaxNumWorkgroups =
+      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
+                                     /*Size=*/3, /*DefaultVal=*/0);
+
+  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+      !Intrinsic::getDeclarationIfExists(CI->getModule(),
+                                         Intrinsic::amdgcn_dispatch_ptr) &&
+      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
+    return false;
+
+  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
+  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+  Value *Remainders[3] = {nullptr, nullptr, nullptr};
+  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+
+  const DataLayout &DL = F->getDataLayout();
+
+  // We expect to see several GEP users, casted to the appropriate type and
+  // loaded.
+  for (User *U : CI->users()) {
+    if (!U->hasOneUse())
+      continue;
+
+    int64_t Offset = 0;
+    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
+    auto *BCI = dyn_cast<BitCastInst>(U);
+    if (!Load && !BCI) {
+      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+        continue;
+      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+      BCI = dyn_cast<BitCastInst>(*U->user_begin());
+    }
+
+    if (BCI) {
+      if (!BCI->hasOneUse())
+        continue;
+      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
+    }
+
+    if (!Load || !Load->isSimple())
+      continue;
+
+    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+    // TODO: Handle merged loads.
+    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
+      switch (Offset) {
+      case HIDDEN_BLOCK_COUNT_X:
+        if (LoadSize == 4) {
+          BlockCounts[0] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+        }
+        break;
+      case HIDDEN_BLOCK_COUNT_Y:
+        if (LoadSize == 4) {
+          BlockCounts[1] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+        }
+        break;
+      case HIDDEN_BLOCK_COUNT_Z:
+        if (LoadSize == 4) {
+          BlockCounts[2] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+        }
+        break;
+      case HIDDEN_GROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case HIDDEN_REMAINDER_X:
+        if (LoadSize == 2)
+          Remainders[0] = Load;
+        break;
+      case HIDDEN_REMAINDER_Y:
+        if (LoadSize == 2)
+          Remainders[1] = Load;
+        break;
+      case HIDDEN_REMAINDER_Z:
+        if (LoadSize == 2)
+          Remainders[2] = Load;
+        break;
+      default:
+        break;
+      }
+    } else { // Base is DispatchPtr.
+      switch (Offset) {
+      case WORKGROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case WORKGROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case WORKGROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case GRID_SIZE_X:
+        if (LoadSize == 4)
+          GridSizes[0] = Load;
+        break;
+      case GRID_SIZE_Y:
+        if (LoadSize == 4)
+          GridSizes[1] = Load;
+        break;
+      case GRID_SIZE_Z:
+        if (LoadSize == 4)
+          GridSizes[2] = Load;
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  bool MadeChange = false;
+  if (IsV5OrAbove && HasUniformWorkGroupSize) {
+    // Under v5 __ockl_get_local_size returns the value computed by the
+    // expression:
+    //
+    //   workgroup_id < hidden_block_count ? hidden_group_size :
+    //   hidden_remainder
+    //
+    // For functions with the attribute uniform-work-group-size=true. we can
+    // evaluate workgroup_id < hidden_block_count as true, and thus
+    // hidden_group_size is returned for __ockl_get_local_size.
+    for (int I = 0; I < 3; ++I) {
+      Value *BlockCount = BlockCounts[I];
+      if (!BlockCount)
+        continue;
+
+      using namespace llvm::PatternMatch;
+      auto GroupIDIntrin =
+          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+      for (User *ICmp : BlockCount->users()) {
+        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
+                                       m_Specific(BlockCount)))) {
+          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
+          MadeChange = true;
+        }
+      }
+    }
+
+    // All remainders should be 0 with uniform work group size.
+    for (Value *Remainder : Remainders) {
+      if (!Remainder)
+        continue;
+      Remainder->replaceAllUsesWith(
+          Constant::getNullValue(Remainder->getType()));
+      MadeChange = true;
+    }
+  } else if (HasUniformWorkGroupSize) { // Pre-V5.
+    // Pattern match the code used to handle partial workgroup dispatches in the
+    // library implementation of get_local_size, so the entire function can be
+    // constant folded with a known group size.
+    //
+    // uint r =... [truncated]
``````````

https://github.com/llvm/llvm-project/pull/177432
