https://github.com/linuxrocks123 created https://github.com/llvm/llvm-project/pull/164847
This PR optimizes the pattern bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU. It also creates a Clang builtin for s_bcnt0_i32 so that users can call this instruction directly instead of relying on the compiler to match this pattern. >From ddda6473ab7ae8485a906a749eebad0853b857ca Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Thu, 23 Oct 2025 11:50:32 -0500 Subject: [PATCH] Initial work --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++ .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 43 +++++++++++++++++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 +++- 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 8428fa97fe445..f17156f8a24ab 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc") + TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 9e334d4316336..50b43a1c927ce 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; +def int_amdgcn_bcnt032_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + +def int_amdgcn_bcnt064_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], 
[llvm_i64_ty], [IntrNoMem]>; + // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8e35ba77d69aa..39b558694edf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -35,6 +36,7 @@ #include "llvm/Support/KnownFPClass.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" +#include <cstdint> #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); +// Disable transforming bitsin(typeof(x)) - popcount(x) to bcnt0(x) so we can better test the backend implementations. 
+static cl::opt<bool> + DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", + cl::desc("Prevent transforming bitsin(typeof(x)) - " + "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, cl::init(false)); + class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); + bool visitCtpop(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); + case Intrinsic::ctpop: + return visitCtpop(I); default: return false; } @@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, return insertValues(Builder, FractArg->getType(), ResultVals); } +bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { + uint32_t BitWidth, DestinationWidth, IntrinsicWidth; + if (!I.hasOneUse() || + !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth())) + return false; + + BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back()); + if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub) + return false; + + ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0)); + if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth) + return false; + + IRBuilder<> Builder(MustBeSub); + Instruction *TransformedIns = + Builder.CreateIntrinsic(BitWidth > 32 ? 
Intrinsic::amdgcn_bcnt064_lo + : Intrinsic::amdgcn_bcnt032_lo, + {}, {I.getArgOperand(0)}); + + if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != + (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth())) + TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc( + TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); + + MustBeSub->replaceAllUsesWith(TransformedIns); + TransformedIns->takeName(MustBeSub); + MustBeSub->eraseFromParent(); + return true; +} + bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *FractArg = matchFractPat(I); if (!FractArg) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 84287b621fe78..29104d33a8aa8 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", } // End isReMaterializable = 1, isAsCheapAsAMove = 1 let Defs = [SCC] in { -def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; -def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; +def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))] +>; +def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))] +>; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))] >; _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
