llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-globalisel Author: None (vangthao95) <details> <summary>Changes</summary> Patch 4 of 4 patches to implement full G_MUL support in regbanklegalize. --- Patch is 74.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175889.diff 8 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+23) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h (+1) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+5-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h (+1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+91-42) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir (+21-21) - (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+106-101) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll (+195-203) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 1a8bd6d8de261..a60366e5382a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -842,6 +842,27 @@ bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) { return true; } +bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(Dst); + assert(DstTy == S64); + auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg()); + auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg()); + + // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to + // match GlobalISel with old regbankselect. 
+ auto Lo = B.buildMul({VgprRB_S32}, Op1.getReg(0), Op2.getReg(0)); + auto Carry = B.buildUMulH({VgprRB_S32}, Op1.getReg(0), Op2.getReg(0)); + auto MulLo0Hi1 = B.buildMul({VgprRB_S32}, Op1.getReg(0), Op2.getReg(1)); + auto MulHi0Lo1 = B.buildMul({VgprRB_S32}, Op1.getReg(1), Op2.getReg(0)); + auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1); + auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry); + + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; +} + bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); @@ -1006,6 +1027,8 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI, } case SplitTo32: return lowerSplitTo32(MI); + case SplitTo32Mul: + return lowerSplitTo32Mul(MI); case SplitTo32Select: return lowerSplitTo32Select(MI); case SplitTo32SExtInReg: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index f92ed3de6cf27..86669ae6ff6c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -127,6 +127,7 @@ class RegBankLegalizeHelper { bool lowerS_BFE(MachineInstr &MI); bool lowerUniMAD64(MachineInstr &MI); bool lowerSplitTo32(MachineInstr &MI); + bool lowerSplitTo32Mul(MachineInstr &MI); bool lowerSplitTo16(MachineInstr &MI); bool lowerSplitTo32Select(MachineInstr &MI); bool lowerSplitTo32SExtInReg(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5a03f6b5463ad..40f298ba51352 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -487,13 +487,17 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}}); + 
bool HasVecMulU64 = ST->hasVectorMulU64(); addRulesForGOpcs({G_MUL}, Standard) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}}) .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}) - .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}); + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64) + .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64); bool hasMulHi = ST->hasScalarMulHiInsts(); addRulesForGOpcs({G_UMULH, G_SMULH}, Standard) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index b5fd6683d319b..ce61e3cb22b9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -229,6 +229,7 @@ enum LoweringMethodID { S_Mul64, S_Mul64Div, SplitTo32, + SplitTo32Mul, ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 2d5585d12b823..991f11809f346 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -783,10 +783,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5] +; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add3_u32 v1, v3, v1, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i64: @@ -1530,14 +1531,18 @@ define amdgpu_ps <8 x 
i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cselect_b32 s33, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: s_add_u32 s19, s34, s19 -; GFX7-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-NEXT: s_addc_u32 s28, s35, s28 -; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0 +; GFX7-NEXT: s_cmp_lg_u32 s25, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s26, 0 +; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_addc_u32 s19, s25, s19 -; GFX7-NEXT: v_mov_b32_e32 v2, s13 ; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_cmp_lg_u32 s20, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-NEXT: s_cselect_b32 s20, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 ; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX7-NEXT: s_addc_u32 s20, s20, 0 @@ -1613,6 +1618,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_add_u32 s27, s39, s27 ; GFX7-NEXT: s_addc_u32 s25, s40, s25 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0 +; GFX7-NEXT: s_cmp_lg_u32 s30, 0 +; GFX7-NEXT: s_cselect_b32 s30, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s33, 0 @@ -1622,6 +1629,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 ; GFX7-NEXT: s_addc_u32 s21, s30, s27 ; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_cmp_lg_u32 s22, 0 +; GFX7-NEXT: s_cselect_b32 s22, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s23, 0 ; GFX7-NEXT: s_addc_u32 s22, s22, 0 ; GFX7-NEXT: s_cmp_lg_u32 s24, 0 @@ -1751,14 +1760,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cselect_b32 s33, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: s_add_u32 s19, s34, s19 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: s_addc_u32 s28, s35, s28 -; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s25, 0 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s14 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_addc_u32 s19, s25, s19 -; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_cselect_b32 s20, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX8-NEXT: s_addc_u32 s20, s20, 0 @@ -1834,6 +1847,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_add_u32 s27, s39, s27 ; GFX8-NEXT: s_addc_u32 s25, s40, s25 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s30, 0 +; GFX8-NEXT: s_cselect_b32 s30, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s33, 0 @@ -1843,6 +1858,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: s_addc_u32 s21, s30, s27 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s22, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_addc_u32 s22, s22, 0 ; GFX8-NEXT: s_cmp_lg_u32 s24, 0 @@ -1950,9 +1967,13 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_add_u32 s19, s34, s19 ; GFX9-NEXT: s_addc_u32 s24, s35, s24 ; GFX9-NEXT: s_cselect_b32 s34, 1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_addc_u32 s19, s22, s19 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, 0 ; GFX9-NEXT: s_cmp_lg_u32 s22, 0 @@ -2014,6 +2035,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_add_u32 s24, s39, s24 ; GFX9-NEXT: s_addc_u32 s22, s40, s22 ; GFX9-NEXT: s_cselect_b32 s39, 1, 0 +; 
GFX9-NEXT: s_cmp_lg_u32 s30, 0 +; GFX9-NEXT: s_cselect_b32 s30, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s31, 0 ; GFX9-NEXT: s_addc_u32 s30, s30, 0 ; GFX9-NEXT: s_cmp_lg_u32 s33, 0 @@ -2023,6 +2046,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_addc_u32 s21, s30, s24 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cselect_b32 s26, 1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 @@ -2129,12 +2154,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18 ; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23 ; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0 ; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14 ; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 ; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14 @@ -2168,12 +2199,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24 ; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21 ; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12 -; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21 ; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11 -; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0 ; GFX10PLUS-NEXT: s_add_u32 
s23, s35, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21 @@ -2193,34 +2222,38 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23 ; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21 ; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s29, 0 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14 +; GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0 -; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13 +; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0 -; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12 +; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11 +; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10 ; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23 ; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s25, 0 +; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9 +; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10 +; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9 +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8 ; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21 ; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0 -; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8 ; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0 ; GFX10PLUS-NEXT: 
s_addc_u32 s1, s1, s2 @@ -2308,12 +2341,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_u32 s18, s33, s18 ; GFX12-NEXT: s_add_co_ci_u32 s23, s34, s23 ; GFX12-NEXT: s_cselect_b32 s33, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s21, 0 +; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX12-NEXT: s_cselect_b32 s21, 1, 0 ; GFX12-NEXT: s_cmp_lg_u32 s22, 0 ; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14 ; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18 ; GFX12-NEXT: s_cselect_b32 s21, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s19, 0 +; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX12-NEXT: s_cselect_b32 s19, 1, 0 ; GFX12-NEXT: s_cmp_lg_u32 s20, 0 -; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX12-NEXT: s_add_co_ci_u32 s19, s19, 0 ; GFX12-NEXT: s_cmp_lg_u32 s21, 0 ; GFX12-NEXT: s_mul_i32 s21, s0, s14 @@ -2347,12 +2386,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_u32 s23, s23, s24 ; GFX12-NEXT: s_add_co_ci_u32 s21, s34, s21 ; GFX12-NEXT: s_mul_i32 s34, s1, s12 -; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX12-NEXT: s_cselect_b32 s24, 1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s34, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s35, s21 ; GFX12-NEXT: s_mul_i32 s35, s2, s11 -; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX12-NEXT: s_cselect_b32 s34, 1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s35, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s36, s21 @@ -2372,34 +2409,38 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_u32 s23, s38, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s39, s21 ; GFX12-NEXT: s_cselect_b32 s38, 1, 0 -; GFX12-NEXT: s_cmp_lg_u32 s30, 0 +; GFX12-NEXT: s_cmp_lg_u32 s29, 0 ; GFX12-NEXT: s_mul_i32 s1, s1, s14 +; GFX12-NEXT: s_cselect_b32 s29, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s30, 0 +; GFX12-NEXT: s_mul_i32 s2, s2, s13 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX12-NEXT: s_cmp_lg_u32 s31, 0 -; GFX12-NEXT: s_mul_i32 s2, s2, s13 +; 
GFX12-NEXT: s_mul_i32 s3, s3, s12 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX12-NEXT: s_cmp_lg_u32 s33, 0 -; GFX12-NEXT: s_mul_i32 s3, s3, s12 +; GFX12-NEXT: s_mul_i32 s4, s4, s11 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX12-NEXT: s_cmp_lg_u32 s20, 0 -; GFX12-NEXT: s_mul_i32 s4, s4, s11 +; GFX12-NEXT: s_mul_i32 s5, s5, s10 ; GFX12-NEXT: s_add_co_ci_u32 s20, s29, s23 ; GFX12-NEXT: s_cselect_b32 s23, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s25, 0 +; GFX12-NEXT: s_mul_i32 s6, s6, s9 +; GFX12-NEXT: s_cselect_b32 s25, 1, 0 ; GFX12-NEXT: s_cmp_lg_u32 s26, 0 ; GFX12-NEXT: s_mul_i32 s26, s0, s15 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX12-NEXT: s_cmp_lg_u32 s27, 0 -; GFX12-NEXT: s_mul_i32 s5, s5, s10 +; GFX12-NEXT: s_mul_i32 s7, s7, s8 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX12-NEXT: s_cmp_lg_u32 s28, 0 -; GFX12-NEXT: s_mul_i32 s6, s6, s9 +; GFX12-NEXT: s_mul_i32 s0, s0, s8 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX12-NEXT: s_cmp_lg_u32 s23, 0 -; GFX12-NEXT: s_mul_i32 s7, s7, s8 ; GFX12-NEXT: s_add_co_ci_u32 s15, s25, s21 ; GFX12-NEXT: s_add_co_ci_u32 s21, s22, s26 ; GFX12-NEXT: s_cmp_lg_u32 s38, 0 -; GFX12-NEXT: s_mul_i32 s0, s0, s8 ; GFX12-NEXT: s_add_co_ci_u32 s1, s21, s1 ; GFX12-NEXT: s_cmp_lg_u32 s37, 0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s2 @@ -2488,12 +2529,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX1250-NEXT: s_add_co_u32 s18, s33, s18 ; GFX1250-NEXT: s_add_co_ci_u32 s23, s34, s23 ; GFX1250-NEXT: s_cselect_b32 s33, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s21, 0 +; GFX1250-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX1250-NEXT: s_cselect_b32 s21, 1, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s22, 0 ; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s14 ; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18 ; GFX1250-NEXT: s_cselect_b32 s21, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s19, 0 +; GFX1250-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX1250-NEXT: s_cselect_b32 s19, 1, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s20, 0 -; GFX1250-NEXT: s_mul_hi_u32 s34, s1, s13 +; 
GFX1250-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s21, 0 ; GFX1250-NEXT: s_mul_i32 s21, s0, s14 @@ -2527,12 +2574,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX1250-NEXT: s_add_co_u32 s23, s23, s24 ; GFX1250-NEXT: s_add_co_ci_u32 s21, s34, s21 ; GFX1250-NEXT: s_mul_i32 s34, s1, s12 -; GFX1250-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX1250-NEXT: s_cselect_b32 s24, 1, 0 ; GFX1250-NEXT: s_add_co_u32 s23, s34, s23 ; GFX1250-NEXT: s_add_co_ci_u32 s21, s35, s21 ; GFX1250-NEXT: s_mul_i32 s35, s2, s11 -; GFX1250-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX1250-NEXT: s_cselect_b32 s34, 1, 0 ; GFX1250-NEXT: s_add_co_u32 s23, s35, s23 ; GFX1250-NEXT: s_add_co_ci_u32 s21, s36, s21 @@ -2552,34 +2597,38 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX1250-NEXT: s_add_co_u32 s23, s38, s23 ; GFX1250-NEXT: s_add_co_ci_u32 s21, s39, s21 ; GFX1250-NEXT: s_cselect_b32 s38, 1, 0 -; GFX1250-NEXT: s_cmp_lg_u32 s30, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s29, 0 ; GFX1250-NEXT: s_mul_i32 s1, s1, s14 +; GFX1250-NEXT: s_cselect_b32 s29, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s30, 0 +; GFX1250-NEXT: s_mul_i32 s2, s2, s13 ; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s31, 0 -; GFX1250-NEXT: s_mul_i32 s2, s2, s13 +; GFX1250-NEXT: s_mul_i32 s3, s3, s12 ; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s33, 0 -; GFX1250-NEXT: s_mul_i32 s3, s3, s12 +; GFX1250-NEXT: s_mul_i32 s4, s4, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s20, 0 -; GFX1250-NEXT: s_mul_i32 s4, s4, s11 +; GFX1250-NEXT: s_mul_i32 s5, s5, s10 ; GFX1250-NEXT: s_add_co_ci_u32 s20, s29, s23 ; GFX1250-NEXT: s_cselect_b32 s23, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s25, 0 +; GFX1250-NEXT: s_mul_i32 s6, s6, s9 +; GFX1250-NEXT: s_cselect_b32 s25, 1, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s26, 0 ; GFX1250-NEXT: s_mul_i32 s26, s0, s15 ; GFX1250-NEXT: 
s_add_co_ci_u32 s25, s25, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s27, 0 -; GFX1250-NEXT: s_mul_i32 s5, s5, s10 +; GFX1250-NEXT: s_mul_i32 s7, s7, s8 ; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX1250-NEXT: s_cmp_lg_u32 s28, 0 -; GFX1250-NEXT: s_mul_i32 s6, s6, s9 +; GFX1250-NEXT: s_mul_i32 s0, s0, s8... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/175889 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
