https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/128494
>From 8568d7b41b3d38f840ccae4a4f12bad8a9bcb060 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Mon, 24 Feb 2025 17:15:53 +0700
Subject: [PATCH 1/5] AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64

We should handle this for all of the readlane-style and DPP ops.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 16 ++++++
 .../AMDGPU/bitcast-fold-lane-ops.ll       | 52 +++++++++----------
 .../InstCombine/AMDGPU/permlane64.ll      |  6 +--
 3 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index bac3bb5fde7b0..1f56b0bfc86dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1128,9 +1128,25 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
 
+    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
+    if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
+      Value *BCSrc = BC->getOperand(0);
+
+      // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
+      if (isTypeLegal(BCSrc->getType())) {
+        SmallVector<Value *, 2> Args(II.args());
+        Args[0] = BCSrc;
+        CallInst *NewCall = IC.Builder.CreateIntrinsic(
+            II.getIntrinsicID(), {BCSrc->getType()}, Args);
+        NewCall->takeName(&II);
+        return new BitCastInst(NewCall, II.getType());
+      }
+    }
+
     return std::nullopt;
   }
   case Intrinsic::amdgcn_writelane: {
+    // TODO: Fold bitcast like readlane.
     if (simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
     return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
index d4dae239b1e7d..490086afb51b2 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
@@ -4,8 +4,8 @@
 define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT: ret i32 [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
@@ -16,9 +16,9 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
 define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(float %val, ptr %use.ptr) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(
 ; CHECK-SAME: float [[VAL:%.*]], ptr [[USE_PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
 ; CHECK-NEXT: store float [[VAL]], ptr [[USE_PTR]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT: [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[RESULT]] to i32
 ; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %bitcast = bitcast float %val to i32
@@ -46,9 +46,7 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_call(float %val) {
 define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
 ; CHECK-LABEL: define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT: [[CALL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
-; CHECK-NEXT: [[RESULT:%.*]] = bitcast i32 [[CALL]] to float
+; CHECK-NEXT: [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
 ; CHECK-NEXT: ret float [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
@@ -60,8 +58,8 @@ define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
 define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_v2f16_to_i32_readfirstlane(
 ; CHECK-SAME: <2 x half> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <2 x half> [[VAL]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast <2 x half> [[RESULT1]] to i32
 ; CHECK-NEXT: ret i32 [[RESULT]]
 ;
   %bitcast = bitcast <2 x half> %val to i32
@@ -72,8 +70,8 @@ define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
 define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(
 ; CHECK-SAME: <2 x bfloat> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <2 x bfloat> [[VAL]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call <2 x bfloat> @llvm.amdgcn.readfirstlane.v2bf16(<2 x bfloat> [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast <2 x bfloat> [[RESULT1]] to i32
 ; CHECK-NEXT: ret i32 [[RESULT]]
 ;
   %bitcast = bitcast <2 x bfloat> %val to i32
@@ -84,8 +82,8 @@ define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
 define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
 ; CHECK-LABEL: define i64 @test_bitcast_f64_to_i64_readfirstlane(
 ; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast double [[VAL]] to i64
-; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast double [[RESULT1]] to i64
 ; CHECK-NEXT: ret i64 [[RESULT]]
 ;
   %bitcast = bitcast double %val to i64
@@ -96,8 +94,8 @@ define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
 define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
 ; CHECK-LABEL: define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(
 ; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast double [[VAL]] to <2 x i32>
-; CHECK-NEXT: [[RESULT:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast double [[RESULT1]] to <2 x i32>
 ; CHECK-NEXT: ret <2 x i32> [[RESULT]]
 ;
   %bitcast = bitcast double %val to <2 x i32>
@@ -108,8 +106,8 @@ define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
 define i64 @test_bitcast_v4i16_to_i64_readfirstlane(<4 x i16> %val) {
 ; CHECK-LABEL: define i64 @test_bitcast_v4i16_to_i64_readfirstlane(
 ; CHECK-SAME: <4 x i16> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <4 x i16> [[VAL]] to i64
-; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast <4 x i16> [[RESULT1]] to i64
 ; CHECK-NEXT: ret i64 [[RESULT]]
 ;
   %bitcast = bitcast <4 x i16> %val to i64
@@ -145,8 +143,8 @@ define i32 @test_bitcast_v8i4_to_i32_readfirstlane(<8 x i4> %val) {
 define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
 ; CHECK-LABEL: define float @test_bitcast_i32_to_f32_readfirstlane(
 ; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32 [[VAL]] to float
-; CHECK-NEXT: [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast i32 [[RESULT1]] to float
 ; CHECK-NEXT: ret float [[RESULT]]
 ;
   %bitcast = bitcast i32 %val to float
@@ -157,8 +155,8 @@ define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
 define i16 @test_bitcast_f16_to_i16_readfirstlane(half %val) {
 ; CHECK-LABEL: define i16 @test_bitcast_f16_to_i16_readfirstlane(
 ; CHECK-SAME: half [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast half [[VAL]] to i16
-; CHECK-NEXT: [[RESULT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[BITCAST]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[VAL]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast half [[RESULT1]] to i16
 ; CHECK-NEXT: ret i16 [[RESULT]]
 ;
   %bitcast = bitcast half %val to i16
@@ -181,8 +179,8 @@ define i16 @test_bitcast_v2i8_to_i16_readfirstlane(<2 x i8> %val) {
 define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val) {
 ; CHECK-LABEL: define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(
 ; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <16 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.amdgcn.readfirstlane.v16i32(<16 x i32> [[BITCAST]])
+; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <16 x i32>
 ; CHECK-NEXT: ret <16 x i32> [[TMP1]]
 ;
   %bitcast = bitcast <16 x float> %val to <16 x i32>
@@ -193,8 +191,8 @@ define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val
 define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val) {
 ; CHECK-LABEL: define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(
 ; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <8 x i64>
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> [[BITCAST]])
+; CHECK-NEXT: [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <8 x i64>
 ; CHECK-NEXT: ret <8 x i64> [[TMP1]]
 ;
   %bitcast = bitcast <16 x float> %val to <8 x i64>
@@ -205,8 +203,8 @@ define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val)
 define i32 @test_bitcast_f32_to_i32_readlane(float %val, i32 inreg %lane.index) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readlane(
 ; CHECK-SAME: float [[VAL:%.*]], i32 inreg [[LANE_INDEX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[BITCAST]], i32 [[LANE_INDEX]])
+; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE_INDEX]])
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT: ret i32 [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
index 3908f0b778508..5dd209316d6cb 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
@@ -12,9 +12,9 @@ define i32 @test_constant() {
 
 define i32 @test_bitcast_f32_to_i32_permlane64(float %val) {
 ; CHECK-LABEL: @test_bitcast_f32_to_i32_permlane64(
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL:%.*]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[BITCAST]])
-; CHECK-NEXT: ret i32 [[RESULT]]
+; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL1:%.*]])
+; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT: ret i32 [[BITCAST]]
 ;
   %bitcast = bitcast float %val to i32
   %result = call i32 @llvm.amdgcn.permlane64.i32(i32 %bitcast)
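PATCH 1 boils down to moving a one-use bitcast past the lane intrinsic so the operation runs on the source type. A minimal before/after sketch in LLVM IR (the function names @example and @example.folded are made up for illustration; the intrinsic manglings are the ones exercised by the tests above):

  define i32 @example(float %val) {
    ; Before the fold: the lane op consumes the bitcasted integer.
    %bitcast = bitcast float %val to i32
    %result = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %bitcast)
    ret i32 %result
  }

  define i32 @example.folded(float %val) {
    ; After the fold: the lane op runs on the original type and the
    ; bitcast is applied to its result instead.
    %read = call float @llvm.amdgcn.readfirstlane.f32(float %val)
    %result = bitcast float %read to i32
    ret i32 %result
  }

The isTypeLegal guard keeps illegal source types away from the intrinsic, which is why tests like test_bitcast_v8i4_to_i32_readfirstlane and test_bitcast_v2i8_to_i16_readfirstlane stay untouched in the diff above.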
>From bf1798782624bd60b22d7d866773e0cd5b5b1c75 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Tue, 25 Feb 2025 09:09:41 +0700
Subject: [PATCH 2/5] Make sure convergence tokens are preserved

---
 .../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 14 ++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 1f56b0bfc86dc..326bfb2796bf4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,12 +1134,14 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
-        SmallVector<Value *, 2> Args(II.args());
-        Args[0] = BCSrc;
-        CallInst *NewCall = IC.Builder.CreateIntrinsic(
-            II.getIntrinsicID(), {BCSrc->getType()}, Args);
-        NewCall->takeName(&II);
-        return new BitCastInst(NewCall, II.getType());
+        Module *M = IC.Builder.GetInsertBlock()->getModule();
+        // Mutate the call in place to ensure operand bundles are preserved.
+        Function *Remangled =
+            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
+
+        II.setCalledFunction(Remangled);
+        IC.replaceOperand(II, 0, BCSrc);
+        return new BitCastInst(&II, II.getType());
       }
     }
 
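The issue PATCH 2 is chasing: these lane intrinsics are convergent, so calls to them may carry a "convergencectrl" operand bundle, and a call rebuilt with CreateIntrinsic does not copy bundles over (see the TODO added in PATCH 4). A minimal sketch of the IR shape that has to survive the fold, assuming a hypothetical @example.token (the bundle syntax matches the tests added in PATCH 5):

  define i32 @example.token(float %val) convergent {
    %t = call token @llvm.experimental.convergence.entry()
    ; The rebuilt call must keep the convergencectrl bundle; dropping
    ; it would change the convergence semantics of the program.
    %read = call float @llvm.amdgcn.readfirstlane.f32(float %val) [ "convergencectrl"(token %t) ]
    %result = bitcast float %read to i32
    ret i32 %result
  }

Mutating the original call in place, as this patch does, sidesteps the problem because the bundle operands never leave the instruction.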
>From 87e44f3f65cd8687c1b4a70eca72641e9fea3ae6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Tue, 25 Feb 2025 09:16:00 +0700
Subject: [PATCH 3/5] Revert "Make sure convergence tokens are preserved"

This reverts commit bf1798782624bd60b22d7d866773e0cd5b5b1c75.
---
 .../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 326bfb2796bf4..1f56b0bfc86dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,14 +1134,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
-        Module *M = IC.Builder.GetInsertBlock()->getModule();
-        // Mutate the call in place to ensure operand bundles are preserved.
-        Function *Remangled =
-            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-        II.setCalledFunction(Remangled);
-        IC.replaceOperand(II, 0, BCSrc);
-        return new BitCastInst(&II, II.getType());
+        SmallVector<Value *, 2> Args(II.args());
+        Args[0] = BCSrc;
+        CallInst *NewCall = IC.Builder.CreateIntrinsic(
+            II.getIntrinsicID(), {BCSrc->getType()}, Args);
+        NewCall->takeName(&II);
+        return new BitCastInst(NewCall, II.getType());
       }
     }
 
>From 92bd22e6cf2d239f22a903fb4d634507a9267dea Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Tue, 25 Feb 2025 09:18:18 +0700
Subject: [PATCH 4/5] Use bundle guard

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 1f56b0bfc86dc..880a4b9f0d20f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,7 +1134,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
-        SmallVector<Value *, 2> Args(II.args());
+        // Make sure convergence tokens are preserved.
+        // TODO: CreateIntrinsic should allow directly copying bundles
+        SmallVector<OperandBundleDef, 2> OpBundles;
+        II.getOperandBundlesAsDefs(OpBundles);
+
+        IRBuilderBase::OperandBundlesGuard Guard(IC.Builder);
+        IC.Builder.setDefaultOperandBundles(OpBundles);
+
+        SmallVector<Value *, 3> Args(II.args());
         Args[0] = BCSrc;
         CallInst *NewCall = IC.Builder.CreateIntrinsic(
             II.getIntrinsicID(), {BCSrc->getType()}, Args);
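Whether the bundles are reattached through the builder's default-bundle mechanism (PATCH 4) or passed explicitly to CreateCall (PATCH 5 below), the intended output is the same: only the data operand changes type, while trailing operands and bundles carry over verbatim. A sketch of the expected result for the two-operand readlane case (hypothetical function name; the pattern mirrors the convergence-token tests below):

  define i32 @example.readlane(float %val, i32 inreg %lane) convergent {
    %t = call token @llvm.experimental.convergence.entry()
    ; The lane-index operand and the convergencectrl bundle are copied
    ; over unchanged; only operand 0 is retyped by the fold.
    %read = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) [ "convergencectrl"(token %t) ]
    %result = bitcast float %read to i32
    ret i32 %result
  }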
>From a231960a9d8eafdfd129da680dfd36d2a3cf28ff Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Tue, 25 Feb 2025 09:22:04 +0700
Subject: [PATCH 5/5] Using CallInst directly actually works

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 11 ++++++-----
 .../InstCombine/AMDGPU/bitcast-fold-lane-ops.ll       |  8 ++++----
 llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll |  6 +++---
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 880a4b9f0d20f..bf7afa6c34b29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,18 +1134,19 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
+        Module *M = IC.Builder.GetInsertBlock()->getModule();
+        Function *Remangled =
+            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
+
         // Make sure convergence tokens are preserved.
         // TODO: CreateIntrinsic should allow directly copying bundles
         SmallVector<OperandBundleDef, 2> OpBundles;
         II.getOperandBundlesAsDefs(OpBundles);
 
-        IRBuilderBase::OperandBundlesGuard Guard(IC.Builder);
-        IC.Builder.setDefaultOperandBundles(OpBundles);
-
         SmallVector<Value *, 3> Args(II.args());
         Args[0] = BCSrc;
-        CallInst *NewCall = IC.Builder.CreateIntrinsic(
-            II.getIntrinsicID(), {BCSrc->getType()}, Args);
+
+        CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
         NewCall->takeName(&II);
         return new BitCastInst(NewCall, II.getType());
       }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
index 490086afb51b2..e458fbd712370 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
@@ -288,8 +288,8 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane_convergencetoken(float %val) c
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane_convergencetoken(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT: ret i32 [[RESULT]]
 ;
   %t = call token @llvm.experimental.convergence.entry()
@@ -302,8 +302,8 @@ define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(float %val, i32 in
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(
 ; CHECK-SAME: float [[VAL:%.*]], i32 inreg [[LANE_INDEX:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[BITCAST]], i32 [[LANE_INDEX]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT: [[RESULT1:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE_INDEX]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT: [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT: ret i32 [[RESULT]]
 ;
   %t = call token @llvm.experimental.convergence.entry()
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
index 5dd209316d6cb..c480ecf6a8b31 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
@@ -24,9 +24,9 @@ define i32 @test_bitcast_f32_to_i32_permlane64(float %val) {
 define i32 @test_bitcast_f32_to_i32_permlane64_convergencetokenn(float %val) convergent {
 ; CHECK-LABEL: @test_bitcast_f32_to_i32_permlane64_convergencetokenn(
 ; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
-; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL:%.*]] to i32
-; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[BITCAST]]) [ "convergencectrl"(token [[T]]) ]
-; CHECK-NEXT: ret i32 [[RESULT]]
+; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL1:%.*]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT: ret i32 [[BITCAST]]
 ;
   %t = call token @llvm.experimental.convergence.entry()
   %bitcast = bitcast float %val to i32