https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/128647
If we are only extracting a single element, rewrite the intrinsic call to use the element type. We should extend this to arbitrary extract shuffles. >From bfe67bc97edc8596cb025bec32bcf685f20d2278 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Tue, 25 Feb 2025 09:45:00 +0700 Subject: [PATCH] AMDGPU: Reduce readfirstlane for single demanded vector element If we are only extracting a single element, rewrite the intrinsic call to use the element type. We should extend this to arbitrary extract shuffles. --- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 46 ++++++++++++++- .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6 ++ ...fy-demanded-vector-elts-lane-intrinsics.ll | 57 +++++++++---------- 3 files changed, 78 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 617974713d6f0..99016fdd0ff91 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1538,6 +1538,49 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, return NewCall; } +Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded( + InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, + APInt &UndefElts) const { + auto *VT = dyn_cast<FixedVectorType>(II.getType()); + if (!VT) + return nullptr; + + const unsigned FirstElt = DemandedElts.countr_zero(); + const unsigned LastElt = DemandedElts.getActiveBits() - 1; + const unsigned MaskLen = LastElt - FirstElt + 1; + + // TODO: Handle general subvector extract. + if (MaskLen != 1) + return nullptr; + + Type *EltTy = VT->getElementType(); + if (!isTypeLegal(EltTy)) + return nullptr; + + Value *Src = II.getArgOperand(0); + + assert(FirstElt == LastElt); + Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt); + + // Make sure convergence tokens are preserved. + // TODO: CreateIntrinsic should allow directly copying bundles + SmallVector<OperandBundleDef, 2> OpBundles; + II.getOperandBundlesAsDefs(OpBundles); + + Module *M = IC.Builder.GetInsertBlock()->getModule(); + Function *Remangled = Intrinsic::getOrInsertDeclaration( + M, II.getIntrinsicID(), {Extract->getType()}); + + // TODO: Preserve callsite attributes? + CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles); + + Value *Result = IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()), + NewCall, FirstElt); + IC.replaceInstUsesWith(II, Result); + IC.eraseInstFromFunction(II); + return Result; +} + std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, @@ -1545,9 +1588,8 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( SimplifyAndSetOp) const { switch (II.getIntrinsicID()) { case Intrinsic::amdgcn_readfirstlane: - // TODO: For a vector extract, should reduce the intrinsic call type. SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - return std::nullopt; + return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts); case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index a0d62008d9ddc..f5062070ac6f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -226,6 +226,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const; + + Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, + IntrinsicInst &II, + const APInt &DemandedElts, + APInt &UndefElts) const; + std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll index 836c739048411..e9d3b5e963b35 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll @@ -4,8 +4,8 @@ define i16 @extract_elt0_v2i16_readfirstlane(<2 x i16> %src) { ; CHECK-LABEL: define i16 @extract_elt0_v2i16_readfirstlane( ; CHECK-SAME: <2 x i16> [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i16> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]]) ; CHECK-NEXT: ret i16 [[ELT]] ; %vec = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src) @@ -16,8 +16,8 @@ define i16 @extract_elt0_v2i16_readfirstlane(<2 x i16> %src) { define i16 @extract_elt0_v1i16_readfirstlane(<1 x i16> %src) { ; CHECK-LABEL: define i16 @extract_elt0_v1i16_readfirstlane( ; CHECK-SAME: <1 x i16> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <1 x i16> @llvm.amdgcn.readfirstlane.v1i16(<1 x i16> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <1 x i16> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i16> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]]) ; CHECK-NEXT: ret i16 [[ELT]] ; %vec = call <1 x i16> @llvm.amdgcn.readfirstlane.v1i16(<1 x i16> %src) @@ -28,8 +28,8 @@ define i16 @extract_elt0_v1i16_readfirstlane(<1 x i16> %src) { define i16 @extract_elt1_v2i16_readfirstlane(<2 x i16> %src) { ; CHECK-LABEL: define i16 @extract_elt1_v2i16_readfirstlane( ; CHECK-SAME: <2 x i16> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i16> [[VEC]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[SRC]], i64 1 +; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]]) ; CHECK-NEXT: ret i16 [[ELT]] ; %vec = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src) @@ -40,8 +40,8 @@ define i16 @extract_elt1_v2i16_readfirstlane(<2 x i16> %src) { define i16 @extract_elt0_v4i16_readfirstlane(<4 x i16> %src) { ; CHECK-LABEL: define i16 @extract_elt0_v4i16_readfirstlane( ; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x i16> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]]) ; CHECK-NEXT: ret i16 [[ELT]] ; %vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src) @@ -52,8 +52,8 @@ define i16 @extract_elt0_v4i16_readfirstlane(<4 x i16> %src) { define i16 @extract_elt2_v4i16_readfirstlane(<4 x i16> %src) { ; CHECK-LABEL: define i16 @extract_elt2_v4i16_readfirstlane( ; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x i16> [[VEC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SRC]], i64 2 +; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]]) ; CHECK-NEXT: ret i16 [[ELT]] ; %vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src) @@ -136,8 +136,8 @@ define <2 x i16> @extract_elt30_v4i16_readfirstlane(<4 x i16> %src) { define half @extract_elt0_v2f16_readfirstlane(<2 x half> %src) { ; CHECK-LABEL: define half @extract_elt0_v2f16_readfirstlane( ; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x half> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[TMP1]]) ; CHECK-NEXT: ret half [[ELT]] ; %vec = call <2 x half> @llvm.amdgcn.readfirstlane.v2i16(<2 x half> %src) @@ -148,8 +148,8 @@ define half @extract_elt0_v2f16_readfirstlane(<2 x half> %src) { define half @extract_elt1_v2f16_readfirstlane(<2 x half> %src) { ; CHECK-LABEL: define half @extract_elt1_v2f16_readfirstlane( ; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x half> [[VEC]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 1 +; CHECK-NEXT: [[ELT:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[TMP1]]) ; CHECK-NEXT: ret half [[ELT]] ; %vec = call <2 x half> @llvm.amdgcn.readfirstlane.v2i16(<2 x half> %src) @@ -186,8 +186,8 @@ define i32 @extract_elt0_nxv4i32_readfirstlane(<vscale x 2 x i32> %src) { define i32 @extract_elt0_v2i32_readfirstlane(<2 x i32> %src) { ; CHECK-LABEL: define i32 @extract_elt0_v2i32_readfirstlane( ; CHECK-SAME: <2 x i32> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i32> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]]) ; CHECK-NEXT: ret i32 [[ELT]] ; %vec = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> %src) @@ -198,8 +198,8 @@ define i32 @extract_elt0_v2i32_readfirstlane(<2 x i32> %src) { define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(<2 x ptr addrspace(3)> %src) { ; CHECK-LABEL: define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane( ; CHECK-SAME: <2 x ptr addrspace(3)> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v2p3(<2 x ptr addrspace(3)> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x ptr addrspace(3)> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr addrspace(3)> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) [[TMP1]]) ; CHECK-NEXT: ret ptr addrspace(3) [[ELT]] ; %vec = call <2 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v2p3(<2 x ptr addrspace(3)> %src) @@ -210,8 +210,8 @@ define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(<2 x ptr addrspace(3)> define i64 @extract_elt0_v2i64_readfirstlane(<2 x i64> %src) { ; CHECK-LABEL: define i64 @extract_elt0_v2i64_readfirstlane( ; CHECK-SAME: <2 x i64> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i64> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[TMP1]]) ; CHECK-NEXT: ret i64 [[ELT]] ; %vec = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) @@ -222,8 +222,8 @@ define i64 @extract_elt0_v2i64_readfirstlane(<2 x i64> %src) { define i64 @extract_elt1_v2i64_readfirstlane(<2 x i64> %src) { ; CHECK-LABEL: define i64 @extract_elt1_v2i64_readfirstlane( ; CHECK-SAME: <2 x i64> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i64> [[VEC]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[SRC]], i64 1 +; CHECK-NEXT: [[ELT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[TMP1]]) ; CHECK-NEXT: ret i64 [[ELT]] ; %vec = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) @@ -306,9 +306,8 @@ define <2 x i16> @extract_elt13_v4i16readfirstlane(<4 x i16> %src) { define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify0(i32 %src0, i32 %src2) { ; CHECK-LABEL: define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify0( ; CHECK-SAME: i32 [[SRC0:%.*]], i32 [[SRC2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[INS_1:%.*]] = insertelement <4 x i32> poison, i32 [[SRC0]], i64 1 -; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]]) -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 poison> +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]]) +; CHECK-NEXT: [[SHUFFLE:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0 ; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]] ; %ins.0 = insertelement <4 x i32> poison, i32 %src0, i32 0 @@ -350,8 +349,8 @@ define i32 @extract_elt0_v2i32_readfirstlane_convergencetoken(<2 x i32> %src) co ; CHECK-LABEL: define i32 @extract_elt0_v2i32_readfirstlane_convergencetoken( ; CHECK-SAME: <2 x i32> [[SRC:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry() -; CHECK-NEXT: [[VEC:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[SRC]]) [ "convergencectrl"(token [[T]]) ] -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i32> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]]) [ "convergencectrl"(token [[T]]) ] ; CHECK-NEXT: ret i32 [[ELT]] ; %t = call token @llvm.experimental.convergence.entry() @@ -381,8 +380,8 @@ define < 2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1_convergenc define i1 @extract_elt0_v2i1_readfirstlane(<2 x i1> %src) { ; CHECK-LABEL: define i1 @extract_elt0_v2i1_readfirstlane( ; CHECK-SAME: <2 x i1> [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VEC:%.*]] = call <2 x i1> @llvm.amdgcn.readfirstlane.v2i1(<2 x i1> [[SRC]]) -; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i1> [[VEC]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[SRC]], i64 0 +; CHECK-NEXT: [[ELT:%.*]] = call i1 @llvm.amdgcn.readfirstlane.i1(i1 [[TMP1]]) ; CHECK-NEXT: ret i1 [[ELT]] ; %vec = call <2 x i1> @llvm.amdgcn.readfirstlane.v2i1(<2 x i1> %src) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits