llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Mariusz Sikora (mariusz-sikora-at-amd) <details> <summary>Changes</summary> --- Patch is 177.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/197141.diff 9 Files Affected: - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+12) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+9-9) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+12-12) - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+16-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+4-1) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+25-8) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+8-6) - (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+4-4) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll (+2908-60) ``````````diff diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index cfad312d7535a..751cd9847bd31 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -2205,6 +2205,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case Builtin::BI__builtin_scalbn: return emitBinaryExpMaybeConstrainedFPBuiltin( *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp); + case AMDGPU::BI__builtin_amdgcn_permlane_bcast: + return emitBuiltinWithOneOverloadedType<3>( + *this, E, Intrinsic::amdgcn_permlane_bcast); + case AMDGPU::BI__builtin_amdgcn_permlane_up: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_up); + case AMDGPU::BI__builtin_amdgcn_permlane_down: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_down); + case AMDGPU::BI__builtin_amdgcn_permlane_xor: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_xor); default: return 
nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 0b4cdd0c2c28f..f0531bef642b0 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -1296,7 +1296,7 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1322,7 +1322,7 @@ void test_permlane_bcast(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1348,7 +1348,7 @@ void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: 
[[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1374,7 +1374,7 @@ void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1514,7 +1514,7 @@ void test_s_wakeup_barrier(void *bar) // CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]] // CHECK-NEXT: ret float [[TMP2]] // float test_global_add_f32(global float *addr, float x) { @@ 
-1531,7 +1531,7 @@ float test_global_add_f32(global float *addr, float x) { // CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: ret <2 x half> [[TMP2]] // half2 test_global_add_half2(global half2 *addr, half2 x) { @@ -1548,7 +1548,7 @@ half2 test_global_add_half2(global half2 *addr, half2 x) { // CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: ret <2 x half> [[TMP2]] // half2 test_flat_add_2f16(generic half2 *addr, half2 x) { @@ -1566,7 +1566,7 @@ half2 test_flat_add_2f16(generic half2 *addr, half2 x) { // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> -// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd 
ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> // CHECK-NEXT: ret <2 x i16> [[TMP4]] // @@ -1585,7 +1585,7 @@ short2 test_flat_add_2bf16(generic short2 *addr, short2 x) { // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> -// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> // CHECK-NEXT: ret <2 x i16> [[TMP4]] // diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 8631985de9a0a..63920e91ffcaf 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3763,27 +3763,27 @@ def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">, PureIntrinsic<[llvm_i16_ty], [llvm_i32_ty]>; // llvm.amdgcn.permlane.bcast <src0> <src1> <src2> -def int_amdgcn_permlane_bcast : ClangBuiltin<"__builtin_amdgcn_permlane_bcast">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_bcast : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.up <src0> <src1> <src2> -def int_amdgcn_permlane_up : ClangBuiltin<"__builtin_amdgcn_permlane_up">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def 
int_amdgcn_permlane_up : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.down <src0> <src1> <src2> -def int_amdgcn_permlane_down : ClangBuiltin<"__builtin_amdgcn_permlane_down">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_down : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.xor <src0> <src1> <src2> -def int_amdgcn_permlane_xor : ClangBuiltin<"__builtin_amdgcn_permlane_xor">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_xor : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.idx.gen <src0> <src1> diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 43db1ead84c80..2c21fd1209a02 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6126,6 +6126,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, IID == Intrinsic::amdgcn_permlanex16; bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || IID == Intrinsic::amdgcn_set_inactive_chain_arg; + bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast || + IID == Intrinsic::amdgcn_permlane_up || + IID == Intrinsic::amdgcn_permlane_down || + IID == Intrinsic::amdgcn_permlane_xor; auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, Register Src2, LLT VT) -> Register { @@ -6139,6 +6143,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, case Intrinsic::amdgcn_set_inactive_chain_arg: return LaneOp.addUse(Src1).getReg(0); case Intrinsic::amdgcn_writelane: + case 
Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return LaneOp.addUse(Src1).addUse(Src2).getReg(0); case Intrinsic::amdgcn_permlane16: case Intrinsic::amdgcn_permlanex16: { @@ -6170,9 +6178,11 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, Register Src0 = MI.getOperand(2).getReg(); Register Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 || + IsPermlaneShuffle) { Src1 = MI.getOperand(3).getReg(); - if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 || + IsPermlaneShuffle) { Src2 = MI.getOperand(4).getReg(); } } @@ -8447,6 +8457,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_update_dpp: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return legalizeLaneOp(Helper, MI, IntrID); case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 061b8dc070ead..c1b0e72a8c42d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1691,7 +1691,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, Standard) .Div(S32, {{Vgpr32}, - {IntrId, Vgpr32, SgprB32_ReadFirstLane, SgprB32_ReadFirstLane}}); + {IntrId, Vgpr32, SgprB32_ReadFirstLane, SgprB32_ReadFirstLane}}) + .Div(V2S16, + {{VgprV2S16}, + {IntrId, VgprV2S16, 
SgprB32_ReadFirstLane, SgprB32_ReadFirstLane}}); addRulesForIOpcs({amdgcn_permlane_idx_gen}, Standard) .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, SgprB32_ReadFirstLane}}); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8b89366f89c5a..2de3dbd25d52a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7859,6 +7859,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, IID == Intrinsic::amdgcn_permlanex16; bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || IID == Intrinsic::amdgcn_set_inactive_chain_arg; + bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast || + IID == Intrinsic::amdgcn_permlane_up || + IID == Intrinsic::amdgcn_permlane_down || + IID == Intrinsic::amdgcn_permlane_xor; SDLoc SL(N); MVT IntVT = MVT::getIntegerVT(ValSize); const GCNSubtarget *ST = TLI.getSubtarget(); @@ -7880,6 +7884,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, Operands.push_back(N->getOperand(4)); [[fallthrough]]; case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: Operands.push_back(Src2); [[fallthrough]]; case Intrinsic::amdgcn_readlane: @@ -7913,10 +7921,12 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || IID == Intrinsic::amdgcn_mov_dpp8 || - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 || + IsPermlaneShuffle) { Src1 = N->getOperand(2); if (IID == Intrinsic::amdgcn_writelane || - IID == Intrinsic::amdgcn_update_dpp || IsPermLane16) + IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 || + IsPermlaneShuffle) Src2 = N->getOperand(3); } @@ -8011,18 +8021,21 @@ 
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, DAG.getConstant(EltIdx, SL, MVT::i32)); if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || - IsPermLane16) + IsPermLane16) { Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, DAG.getConstant(EltIdx, SL, MVT::i32)); - if (IID == Intrinsic::amdgcn_writelane) + Pieces.push_back( + createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)); + } else if (IID == Intrinsic::amdgcn_writelane) { Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, DAG.getConstant(EltIdx, SL, MVT::i32)); + Pieces.push_back( + createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); + } else { + Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT)); + } - Pieces.push_back( - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 - ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) - : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); EltIdx += SubVecNumElt; } return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); @@ -11144,6 +11157,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_update_dpp: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return lowerLaneOp(*this, Op.getNode(), DAG); case Intrinsic::amdgcn_dead: { SmallVector<SDValue, 8> Poisons; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index e78ecd1af9b45..d198a591c148a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1205,8 +1205,8 @@ class PermlaneVarPat<SDPatternOperator permlane, >; class PermlaneNoDppPat3Src<SDPatternOperator permlane, - Instruction inst> : GCNPat< - (permlane i32:$src0, i32:$src1, i32:$src2), + Instruction inst, ValueType vt> : GCNPat< 
+ (vt (permlane vt:$src0, i32:$src1, i32:$src2)), (inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2) >; @@ -1611,10 +1611,12 @@ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in { defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32>>; } // End isConvergent = 1 - def : PermlaneNoDppPat3Src<int_amdgcn_permlane_bcast, V_PERMLANE_BCAST_B32_e64>; - def : PermlaneNoDppPat3Src<int_amdgcn_permlane_up, V_PERMLANE_UP_B32_e64>; - def : PermlaneNoDppPat3Src<int_amdgcn_permlane_down, V_PERMLANE_DOWN_B32_e64>; - def : PermlaneNoDppPat3Src<int_amdgcn_permlane_xor, V_PERMLANE_XOR_B32_e64>; + foreach vt = Reg32Types.types in { + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_bcast, V_PERMLANE_BCAST_B32_e64, vt>; + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_up, V_PERMLANE_UP_B32_e64, vt>; + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_down, V_PERMLANE_DOWN_B32_e64, vt>; + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_xor, V_PERMLANE_XOR_B32_e64, vt>; + } def : PermlaneNoDppPat2Src<int_amdgcn_permlane_idx_gen, V_PERMLANE_IDX_GEN_B32_e64>; } // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 68663ae820b57..9c57f1f2e5367 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -802,28 +802,28 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast.i32(... 
[truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/197141 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
