https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/122460
>From ea6a8ce50fbd64222bd897080eed662aaba15e43 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Fri, 10 Jan 2025 14:57:24 +0700 Subject: [PATCH] AMDGPU: Implement isExtractVecEltCheap Once again we have excessive TLI hooks with bad defaults. Permit this for 32-bit element vectors, which are just use-different-register. We should permit 16-bit vectors as cheap with legal packed instructions, but I see some mixed improvements and regressions that need investigation. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +++++ llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/test/CodeGen/AMDGPU/mad-mix.ll | 12 ++++----- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 32 +++++++++++++++++++---- llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 9 ++++--- 5 files changed, 45 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 69dca988b2cad9..88f84d2b2ced47 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1949,6 +1949,13 @@ bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, return Index == 0; } +bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const { + // TODO: This should be more aggressive, particular for 16-bit element + // vectors. However there are some mixed improvements and regressions. + EVT EltTy = VT.getVectorElementType(); + return EltTy.getSizeInBits() % 32 == 0; +} + bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { if (Subtarget->has16BitInsts() && VT == MVT::i16) { switch (Op) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 5c215f76552d9c..bbb96d9115a0a9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -365,6 +365,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override; bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index b520dd1060ec8c..30e3bc3ba5da85 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -385,17 +385,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v4, v2, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SDAG-CI-NEXT: v_mad_f32 v0, v1, v2, v5 +; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle: diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 6b7eff316fe95b..0833dada43e4d5 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -549,17 +549,19 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_fadd_fsub: +; GCN-LABEL: {{^}}fadd_fadd_fsub_0: ; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} + +; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 +; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} + ; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} ; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} -define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) { +define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { bb: %i12 = fadd <2 x float> zeroinitializer, %arg - %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> <i32 1, i32 undef> + %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison> %i13 = fadd <2 x float> zeroinitializer, %shift8 %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2> %i15 = fsub <2 x float> %i14, zeroinitializer @@ -567,6 +569,26 @@ bb: ret void } +; GCN-LABEL: {{^}}fadd_fadd_fsub: +; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} + +; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} + +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) { +bb: + %i12 = fadd <2 x float> %arg, %arg1 + %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison> + %i13 = fadd <2 x float> %arg1, %shift8 + %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2> + %i15 = fsub <2 x float> %i14, %arg1 + store <2 x float> %i15, ptr addrspace(1) %ptr + ret void +} + ; GCN-LABEL: {{^}}fadd_shuffle_v4: ; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index 02e30b6c68e994..af2af23640cee5 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -184,6 +184,7 @@ define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshr_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression: @@ -300,14 +301,15 @@ define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %a ; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0xffff +; SI-NEXT: v_or_b32_e32 v0, 17, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 +; VI-NEXT: v_or_b32_e32 v0, 17, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17) @@ -368,7 +370,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mul_lo_u32 v0, v0, 18 -; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits