llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) <details> <summary>Changes</summary> The form_bitfield_extract combines are relatively safe to use in AMDGPURegBankCombiner, I believe. The only new registers they may create are the constants for the BFX. For those, borrow the register bank from the source register. Fixes #<!-- -->140040 --- Patch is 153.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141590.diff 9 Files Affected: - (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+29) - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+56-63) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+484-541) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+458-506) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (+13-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+111-121) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+172-182) - (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+8-9) ``````````diff diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b1e851183de0d..8981b13dac7ed 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4629,10 +4629,17 @@ bool CombinerHelper::matchBitfieldExtractFromSExtInReg( if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits()) return false; + const RegisterBank *RB = getRegBank(ShiftSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto Cst1 = B.buildConstant(ExtractTy, ShiftImm); auto Cst2 = B.buildConstant(ExtractTy, Width); B.buildSbfx(Dst, ShiftSrc, Cst1, Cst2); + + if (RB) { + MRI.setRegBank(Cst1.getReg(0), *RB); + MRI.setRegBank(Cst2.getReg(0), *RB); + } }; return true; } @@ -4667,10 +4674,18 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(MachineInstr &MI, return false; 
uint64_t Width = APInt(Size, AndImm).countr_one(); + + const RegisterBank *RB = getRegBank(ShiftSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto LSBCst = B.buildConstant(ExtractTy, LSBImm); B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {ShiftSrc, LSBCst, WidthCst}); + + if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(LSBCst.getReg(0), *RB); + } }; return true; } @@ -4717,10 +4732,17 @@ bool CombinerHelper::matchBitfieldExtractFromShr( const int64_t Pos = ShrAmt - ShlAmt; const int64_t Width = Size - ShrAmt; + const RegisterBank *RB = getRegBank(ShlSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto PosCst = B.buildConstant(ExtractTy, Pos); B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst}); + + if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(PosCst.getReg(0), *RB); + } }; return true; } @@ -4775,10 +4797,17 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd( if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size) return false; + const RegisterBank *RB = getRegBank(AndSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto PosCst = B.buildConstant(ExtractTy, Pos); B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst}); + + if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(PosCst.getReg(0), *RB); + } }; return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 94e1175b06b14..96be17c487130 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,5 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, 
lower_uniform_ubfx]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index ff03cf1231d08..b0a239bef649e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -811,16 +811,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s2, s1, 16 +; GFX8-NEXT: s_sext_i32_i16 s3, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s1, s3, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_v2i16: @@ -1014,26 +1013,24 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_ashr_i32 s2, s2, s6 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s5 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_lshr_b32 s4, s2, 16 +; GFX8-NEXT: s_sext_i32_i16 s6, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_lshr_b32 s5, s3, 16 +; 
GFX8-NEXT: s_ashr_i32 s0, s0, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_ashr_i32 s2, s6, s2 +; GFX8-NEXT: s_ashr_i32 s1, s1, s5 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_ashr_i32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_v4i16: @@ -1223,46 +1220,42 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_ashr_i32 s4, s4, s12 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s5, s9 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_ashr_i32 s5, s5, s13 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_lshr_b32 s9, s5, 16 +; GFX8-NEXT: s_ashr_i32 s0, s0, s8 +; GFX8-NEXT: s_sext_i32_i16 s8, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_lshr_b32 s10, s6, 16 +; GFX8-NEXT: s_ashr_i32 s4, s12, s4 +; GFX8-NEXT: s_ashr_i32 s5, s8, s5 +; GFX8-NEXT: s_ashr_i32 s1, s1, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; 
GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_ashr_i32 s2, s2, s6 -; GFX8-NEXT: s_sext_i32_i16 s6, s10 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_ashr_i32 s6, s6, s14 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_lshr_b32 s11, s7, 16 +; GFX8-NEXT: s_ashr_i32 s6, s8, s6 +; GFX8-NEXT: s_ashr_i32 s2, s2, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 -; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_ashr_i32 s7, s7, s15 +; GFX8-NEXT: s_ashr_i32 s3, s3, s11 +; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_ashr_i32 s7, s8, s7 +; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 768a4d039aef9..7077029747c84 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -40,8 +40,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8-NEXT: 
v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f -; GFX8-NEXT: s_and_b32 s1, s1, 0x7f -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7 @@ -70,8 +69,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f -; GFX9-NEXT: s_and_b32 s1, s1, 0x7f -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7 @@ -99,8 +97,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX10-NEXT: s_and_b32 s2, s2, 0x7f -; GFX10-NEXT: s_and_b32 s1, s1, 0x7f -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -129,40 +126,38 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7f -; GFX11-NEXT: s_and_b32 s1, s1, 0x7f -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x60001 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog 
%result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -345,10 +340,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -356,10 +351,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -367,10 +362,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX10-LABEL: s_fshl_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -378,10 +373,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX11-LABEL: s_fshl_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -463,42 +458,17 @@ define i8 
@v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { } define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { -; GFX6-LABEL: s_fshl_i8_4: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i8_4: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i8_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshr_b32 s1, s1, 4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i8_4: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 4 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshl_i8_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 4 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i8_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -556,42 +526,17 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) { } define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { -; GFX6-LABEL: s_fshl_i8_5: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 5 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x50003 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i8_5: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; 
GFX8-NEXT: s_lshr_b32 s1, s1, 3 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i8_5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 5 -; GFX9-NEXT: s_lshr_b32 s1, s1, 3 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i8_5: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 5 -; GFX10-NEXT: s_lshr_b32 s1, s1, 3 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshl_i8_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 5 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x50003 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i8_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_lshr_b32 s1, s1, 3 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x50003 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -674,23 +619,23 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX8-LABEL: s_fshl_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s5, s2, 7 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_and_b32 s2, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_andn2_b32 s3, 7, s5 -; GFX8-NEXT: s_lshr_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s5 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x70001 +; GFX8-NEXT: s_lshr_b32 s4, s2, 
8 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_lshr_b32 s2, s5, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/141590 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits