llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

Make s16 G_U/SBFX legal and widen them in RegBankSelect. This allows the set of BFX formation combines to work on s16 types. (Illustrative sketches of the resulting s16 handling follow at the end of this message.)

---

Patch is 95.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131309.diff

7 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+6-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+30-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+280-365)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+150-230)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sbfx.mir (+15-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ubfx.mir (+16-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+6-21)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6e611ebb4b625..23dd20b51e8e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2068,10 +2068,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .minScalar(0, S32)
       .lower();
 
+  // Only {S32, S32} or {S32, S64} should ever reach codegen.
+  // We allow S/UBFX for S16 so the combiner can form them before
+  // RegBankSelect, and RegBankSelect will then legalize them correctly.
   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
-      .legalFor({{S32, S32}, {S64, S32}})
-      .clampScalar(1, S32, S32)
-      .clampScalar(0, S32, S64)
+      .legalFor({{S16, S16}, {S32, S32}, {S64, S32}})
+      .clampScalar(1, S16, S32)
+      .clampScalar(0, S16, S64)
       .widenScalarToNextPow2(0)
       .scalarize(0);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 27b86723ce474..ed0d52f6b2441 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1485,7 +1485,9 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   Register DstReg = MI.getOperand(0).getReg();
   LLT Ty = MRI.getType(DstReg);
 
+  const LLT S64 = LLT::scalar(64);
   const LLT S32 = LLT::scalar(32);
+  const LLT S16 = LLT::scalar(16);
 
   unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
@@ -1495,6 +1497,18 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   const RegisterBank *DstBank =
       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   if (DstBank == &AMDGPU::VGPRRegBank) {
+    if (Ty == S16) {
+      ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+      B.setInsertPt(B.getMBB(), MI);
+      LegalizerHelper Helper(B.getMF(), ApplyBank, B);
+
+      Helper.widenScalarDst(MI, S32);
+      Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
+      Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_ZEXT);
+      Helper.widenScalarSrc(MI, S32, 3, AMDGPU::G_ZEXT);
+      return true;
+    }
+
     if (Ty == S32)
       return true;
 
@@ -1554,6 +1568,11 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
 
+  if (Ty == S16) {
+    OffsetReg = B.buildAnyExtOrTrunc(S32, OffsetReg).getReg(0);
+    WidthReg = B.buildAnyExtOrTrunc(S32, WidthReg).getReg(0);
+  }
+
   // Ensure the high bits are clear to insert the offset.
   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
@@ -1568,13 +1587,21 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   // TODO: It might be worth using a pseudo here to avoid scc clobber and
   // register class constraints.
-  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
-                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+  unsigned Opc = (Ty != S64) ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+                             : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
 
-  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  Register BFEDst = DstReg;
+  if (Ty == S16) {
+    BFEDst = MRI.createGenericVirtualRegister(S32);
+    MRI.setRegBank(BFEDst, AMDGPU::SGPRRegBank);
+  }
+  auto MIB = B.buildInstr(Opc, {BFEDst}, {SrcReg, MergedInputs});
   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
     llvm_unreachable("failed to constrain BFE");
 
+  if (BFEDst != DstReg)
+    B.buildZExtOrTrunc(DstReg, BFEDst);
+
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 07fcb02d98649..d2b600b04f9fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -40,8 +40,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX8-NEXT:    s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x60001
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, -7
@@ -70,8 +69,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX9-NEXT:    s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x60001
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v0, -7
@@ -99,8 +97,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX10-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX10-NEXT:    s_and_b32 s1, s1, 0x7f
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x60001
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -129,40 +126,38 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0x7f
-; GFX11-NEXT:    s_and_b32 s1, s1, 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX11-NEXT:    s_bfe_u32 s1, s1, 0x60001
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_lo_u32 v1, v0, -7
 ;
GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -205,8 +200,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 +; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 6 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v4, v3, -7 @@ -235,8 +229,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 6 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, -7 @@ -264,9 +257,8 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX10-NEXT: v_bfe_u32 v1, v1, 1, 6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v3, -7 @@ -294,38 +286,37 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v1, v1, 1, 6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX11-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 ; GFX11-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u16 v3, 6, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v3, 0x7f, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -345,10 +336,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -356,10 +347,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -367,10 +358,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX10-LABEL: s_fshl_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, 
s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -378,10 +369,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX11-LABEL: s_fshl_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -408,11 +399,10 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, 1 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -421,11 +411,10 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -433,10 +422,9 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX10-LABEL: v_fshl_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 @@ -446,16 +434,14 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshl_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt) @@ -463,42 +449,17 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { } define 
amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { -; GFX6-LABEL: s_fshl_i8_4: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 4 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i8_4: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i8_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshr_b32 s1, s1, 4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i8_4: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 4 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshl_i8_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 4 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i8_4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -518,37 +479,33 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) { ; GFX8-LABEL: v_fshl_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 4 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 4, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fs... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/131309 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
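To make the change above concrete: once {S16, S16} is a legal type combination for G_UBFX/G_SBFX, the bitfield-extract formation combines that run between the legalizer and RegBankSelect can fold 16-bit shift-and-mask sequences directly. The generic-MIR function below is a minimal illustrative sketch, not one of the patch's tests; the function name and register numbering are made up.

```mir
# Illustrative sketch (not from the patch): extract 6 bits starting at bit 1
# of an s16 value, written as the shift+mask pattern the combines look for.
---
name:            form_ubfx_s16_sketch
legalized:       true
body:             |
  bb.0:
    liveins: $vgpr0
    %0:_(s32) = COPY $vgpr0
    %1:_(s16) = G_TRUNC %0(s32)
    %2:_(s16) = G_CONSTANT i16 1
    %3:_(s16) = G_CONSTANT i16 63
    %4:_(s16) = G_LSHR %1, %2(s16)
    %5:_(s16) = G_AND %4, %3
    ; With s16 G_UBFX legal before RegBankSelect, the combiner can now replace
    ; the G_LSHR/G_AND pair with a single extract of offset 1 and width 6:
    ;   %5:_(s16) = G_UBFX %1, %offset(s16), %width
    %6:_(s32) = G_ANYEXT %5(s16)
    $vgpr0 = COPY %6(s32)
...
```

This is the same rewrite visible in the fshl.ll checks in the diff, where an s_and_b32 + s_lshr_b32 pair becomes a single s_bfe_u32 with the packed operand 0x60001 (width 6, offset 1) on the scalar path, and v_and_b32_e32 + v_lshrrev_b16 becomes v_bfe_u32 on the vector path.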
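The RegBankSelect side can be pictured the same way. For an s16 G_UBFX/G_SBFX whose destination lands on the VGPR bank, the new code in applyMappingBFE widens the operation to the already-supported s32 form. The sketch below shows the expected shape, derived from the widenScalarDst/widenScalarSrc calls in the diff; the register names and the test itself are hypothetical.

```mir
# Sketch only (not one of the patch's tests): an s16 G_UBFX reaching
# RegBankSelect with all inputs in VGPRs.
---
name:            ubfx_s16_vgpr_sketch
legalized:       true
body:             |
  bb.0:
    liveins: $vgpr0, $vgpr1, $vgpr2
    %0:_(s32) = COPY $vgpr0
    %1:_(s32) = COPY $vgpr1
    %2:_(s32) = COPY $vgpr2
    %src:_(s16) = G_TRUNC %0(s32)
    %off:_(s16) = G_TRUNC %1(s32)
    %width:_(s16) = G_TRUNC %2(s32)
    ; Expected rewrite on the VGPR bank (per the widenScalar* calls in the diff):
    ;   %src32:vgpr(s32)   = G_ANYEXT %src(s16)
    ;   %off32:vgpr(s32)   = G_ZEXT %off(s16)
    ;   %width32:vgpr(s32) = G_ZEXT %width(s16)
    ;   %bfx32:vgpr(s32)   = G_UBFX %src32, %off32(s32), %width32
    ;   %bfx:vgpr(s16)     = G_TRUNC %bfx32(s32)
    %bfx:_(s16) = G_UBFX %src, %off(s16), %width
    %out:_(s32) = G_ANYEXT %bfx(s16)
    $vgpr0 = COPY %out(s32)
...
```

On the SGPR bank the patch takes a different route: the offset and width are extended with buildAnyExtOrTrunc, the extract is emitted as S_BFE_U32/S_BFE_I32 into a fresh s32 register, and that temporary is zero-extended or truncated back into the original s16 destination (the BFEDst handling in the diff).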