https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141591
>From 7c8f90225928c0dbffcfa03bd20da3419a80095f Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Tue, 27 May 2025 12:29:02 +0200 Subject: [PATCH 1/2] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 59 ++++++++--------- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 61 +++++++----------- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 63 +++++++------------ llvm/test/CodeGen/AMDGPU/div_i128.ll | 30 ++++----- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 11 ++-- llvm/test/CodeGen/AMDGPU/lround.ll | 18 +++--- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 16 +---- 8 files changed, 104 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, 1 +; GFX6-NEXT: v_mov_b32_e32 v4, 0 +; GFX6-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, 1 +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v3, 1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, 33 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index f317526e6de47..de7e7c7de38b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -80,11 +80,10 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_min_i32 s2, s2, 0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 9 ; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_max_i32 s1, s2, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s3 ; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, s1 @@ -189,11 +188,10 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: s_min_i32 s2, s2, 0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_max_i32 s1, s2, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s3 ; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, s1 @@ -386,11 +384,10 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s1, s4, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s4, s5 ; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_add_i32 s0, s0, s1 @@ -400,11 +397,10 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s4, s3, 0 ; GFX8-NEXT: s_min_i32 s3, s3, 0 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s2, s3, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 @@ -787,11 +783,10 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sub_i32 s8, 0x8000, s8 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s1, s8, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s8, s9 ; GFX8-NEXT: s_min_i32 s1, s1, s8 ; GFX8-NEXT: s_add_i32 s0, s0, s1 @@ -801,11 +796,10 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s8, s5, 0 ; GFX8-NEXT: s_min_i32 s5, s5, 0 ; GFX8-NEXT: s_sub_i32 s5, 0x8000, s5 +; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 ; GFX8-NEXT: s_max_i32 s2, s5, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s8 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s2 @@ -815,11 +809,10 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s6, s5, 0 ; GFX8-NEXT: s_min_i32 s5, s5, 0 ; GFX8-NEXT: s_sub_i32 s5, 0x8000, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s3, s5, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s3 @@ -829,14 +822,13 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_min_i32 s5, s5, 0 ; GFX8-NEXT: s_lshl_b32 s4, s7, 8 ; GFX8-NEXT: s_sub_i32 s5, 0x8000, s5 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 -; GFX8-NEXT: s_max_i32 s4, s5, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -2631,11 +2623,10 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-NEXT: s_max_i32 s3, s2, 0 ; GFX8-NEXT: s_min_i32 s2, s2, 0 ; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_max_i32 s1, s2, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s3 ; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_add_i32 s0, s0, s1 @@ -2835,11 +2826,10 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_max_i32 s4, s3, 0 ; GFX8-NEXT: s_min_i32 s3, s3, 0 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s5 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_min_i32 s3, s3, s4 @@ -3190,11 +3180,10 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_max_i32 s7, s6, 0 ; GFX8-NEXT: s_min_i32 s6, s6, 0 ; GFX8-NEXT: s_sub_i32 s6, 0x8000, s6 +; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s2 -; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 ; GFX8-NEXT: s_max_i32 s6, s6, s8 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_min_i32 s6, s6, s7 @@ -3215,11 +3204,10 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_max_i32 s6, s2, 0 ; GFX8-NEXT: s_min_i32 s2, s2, 0 ; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s7, s3 -; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s2, s2, s7 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_min_i32 s2, s2, s6 @@ -3513,11 +3501,10 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_max_i32 s10, s9, 0 ; GFX8-NEXT: s_min_i32 s9, s9, 0 ; GFX8-NEXT: s_sub_i32 s9, 0x8000, s9 +; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s11, s3 -; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 ; GFX8-NEXT: s_max_i32 s9, s9, s11 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_min_i32 s9, s9, s10 @@ -3538,11 +3525,10 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_max_i32 s9, s3, 0 ; GFX8-NEXT: s_min_i32 s3, s3, 0 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s10, s4 -; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s3, s3, s10 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_min_i32 s3, s3, s9 @@ -3563,11 +3549,10 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_max_i32 s4, s3, 0 ; GFX8-NEXT: s_min_i32 s3, s3, 0 ; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s5 -; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s9 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_min_i32 s3, s3, s4 @@ -3924,11 +3909,10 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_max_i32 s13, s12, 0 ; GFX8-NEXT: s_min_i32 s12, s12, 0 ; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12 +; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s14, s4 -; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13 ; GFX8-NEXT: s_max_i32 s12, s12, s14 -; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s13, s13 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_min_i32 s12, s12, s13 @@ -3949,11 +3933,10 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_max_i32 s12, s4, 0 ; GFX8-NEXT: s_min_i32 s4, s4, 0 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4 +; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s13, s5 -; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_max_i32 s4, s4, s13 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s12 @@ -3974,11 +3957,10 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_max_i32 s5, s4, 0 ; GFX8-NEXT: s_min_i32 s4, s4, 0 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s6 -; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s12 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s5 @@ -3999,11 +3981,10 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_max_i32 s5, s4, 0 ; GFX8-NEXT: s_min_i32 s4, s4, 0 ; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 6873c9e6b9b4e..9028cc2e44995 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -80,11 +80,10 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_lshl_b32 s1, s1, 9 ; GFX8-NEXT: s_add_i32 s3, s3, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, -1 +; GFX8-NEXT: s_add_i32 s2, s2, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_add_i32 s2, s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 @@ -189,11 +188,10 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_add_i32 s3, s3, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, -1 +; GFX8-NEXT: s_add_i32 s2, s2, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_add_i32 s2, s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 @@ -387,11 +385,10 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_add_i32 s5, s5, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, -1 +; GFX8-NEXT: s_add_i32 s4, s4, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_add_i32 s4, s4, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s5, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 @@ -401,11 +398,10 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s4, s3, -1 ; GFX8-NEXT: s_add_i32 s4, s4, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, -1 +; GFX8-NEXT: s_add_i32 s3, s3, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_add_i32 s3, s3, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s4, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 @@ -788,11 +784,10 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_add_i32 s9, s9, 0x8001 ; GFX8-NEXT: s_min_i32 s8, s8, -1 +; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s9, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_min_i32 s1, s1, s8 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 @@ -802,11 +797,10 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s8, s5, -1 ; GFX8-NEXT: s_add_i32 s8, s8, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, -1 +; GFX8-NEXT: s_add_i32 s5, s5, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s8, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 @@ -816,11 +810,10 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s6, s5, -1 ; GFX8-NEXT: s_add_i32 s6, s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, -1 +; GFX8-NEXT: s_add_i32 s5, s5, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_add_i32 s5, s5, 0x8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s3 @@ -830,14 +823,13 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s4, s7, 8 ; GFX8-NEXT: s_add_i32 s6, s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, -1 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_add_i32 s5, s5, 0x8000 -; GFX8-NEXT: s_max_i32 s4, s6, s4 +; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -2634,11 +2626,10 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-NEXT: s_max_i32 s3, s2, -1 ; GFX8-NEXT: s_add_i32 s3, s3, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, -1 +; GFX8-NEXT: s_add_i32 s2, s2, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_add_i32 s2, s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 @@ -2839,11 +2830,10 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_max_i32 s4, s3, -1 ; GFX8-NEXT: s_add_i32 s4, s4, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, -1 +; GFX8-NEXT: s_add_i32 s3, s3, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_add_i32 s3, s3, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s4, s5 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_min_i32 s3, s4, s3 @@ -3196,11 +3186,10 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_max_i32 s7, s6, -1 ; GFX8-NEXT: s_add_i32 s7, s7, 0x8001 ; GFX8-NEXT: s_min_i32 s6, s6, -1 +; GFX8-NEXT: s_add_i32 s6, s6, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s8, s2 -; GFX8-NEXT: s_add_i32 s6, s6, 0x8000 ; GFX8-NEXT: s_max_i32 s7, s7, s8 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_min_i32 s6, s7, s6 @@ -3221,11 +3210,10 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_max_i32 s6, s4, -1 ; GFX8-NEXT: s_add_i32 s6, s6, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, -1 +; GFX8-NEXT: s_add_i32 s4, s4, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s3 -; GFX8-NEXT: s_add_i32 s4, s4, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s6, s7 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_min_i32 s4, s6, s4 @@ -3519,11 +3507,10 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_max_i32 s10, s9, -1 ; GFX8-NEXT: s_add_i32 s10, s10, 0x8001 ; GFX8-NEXT: s_min_i32 s9, s9, -1 +; GFX8-NEXT: s_add_i32 s9, s9, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s11, s3 -; GFX8-NEXT: s_add_i32 s9, s9, 0x8000 ; GFX8-NEXT: s_max_i32 s10, s10, s11 -; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_min_i32 s9, s10, s9 @@ -3544,11 +3531,10 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_max_i32 s9, s6, -1 ; GFX8-NEXT: s_add_i32 s9, s9, 0x8001 ; GFX8-NEXT: s_min_i32 s6, s6, -1 +; GFX8-NEXT: s_add_i32 s6, s6, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s4 -; GFX8-NEXT: s_add_i32 s6, s6, 0x8000 ; GFX8-NEXT: s_max_i32 s9, s9, s10 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_min_i32 s6, s9, s6 @@ -3569,11 +3555,10 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_max_i32 s7, s6, -1 ; GFX8-NEXT: s_add_i32 s7, s7, 0x8001 ; GFX8-NEXT: s_min_i32 s6, s6, -1 +; GFX8-NEXT: s_add_i32 s6, s6, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s9, s5 -; GFX8-NEXT: s_add_i32 s6, s6, 0x8000 ; GFX8-NEXT: s_max_i32 s7, s7, s9 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_min_i32 s6, s7, s6 @@ -3930,11 +3915,10 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_max_i32 s13, s12, -1 ; GFX8-NEXT: s_add_i32 s13, s13, 0x8001 ; GFX8-NEXT: s_min_i32 s12, s12, -1 +; GFX8-NEXT: s_add_i32 s12, s12, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s13, s13 ; GFX8-NEXT: s_sext_i32_i16 s14, s4 -; GFX8-NEXT: s_add_i32 s12, s12, 0x8000 ; GFX8-NEXT: s_max_i32 s13, s13, s14 -; GFX8-NEXT: s_sext_i32_i16 s13, s13 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_min_i32 s12, s13, s12 @@ -3955,11 +3939,10 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_max_i32 s12, s8, -1 ; GFX8-NEXT: s_add_i32 s12, s12, 0x8001 ; GFX8-NEXT: s_min_i32 s8, s8, -1 +; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s13, s5 -; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_max_i32 s12, s12, s13 -; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 ; GFX8-NEXT: s_min_i32 s8, s12, s8 @@ -3980,11 +3963,10 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_max_i32 s9, s8, -1 ; GFX8-NEXT: s_add_i32 s9, s9, 0x8001 ; GFX8-NEXT: s_min_i32 s8, s8, -1 +; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s12, s6 -; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_max_i32 s9, s9, s12 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_min_i32 s8, s9, s8 @@ -4003,13 +3985,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s8, s3 ; GFX8-NEXT: s_max_i32 s9, s8, -1 ; GFX8-NEXT: s_add_i32 s9, s9, 0x8001 -; GFX8-NEXT: s_sub_i32 s6, s10, s6 ; GFX8-NEXT: s_min_i32 s8, s8, -1 +; GFX8-NEXT: s_sub_i32 s6, s10, s6 +; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s7 -; GFX8-NEXT: s_add_i32 s8, s8, 0x8000 ; GFX8-NEXT: s_max_i32 s9, s9, s10 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 ; GFX8-NEXT: s_min_i32 s8, s9, s8 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 06c0417211809..1e82b54c4031e 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -4501,21 +4501,19 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v5 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v0, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v2, v0, s[6:7] +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v0, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v3, v0, s[6:7] -; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-G-O0-NEXT: s_mov_b32 s5, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3 -; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v1, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4 @@ -4590,14 +4588,12 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4 -; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v1, v[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index c316ec71863d0..968471287dc4d 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -797,12 +797,11 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB2_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0x3ff00000 -; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff +; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6 ; GISEL-NEXT: v_lshl_add_u32 v2, v8, 20, v2 -; GISEL-NEXT: v_and_or_b32 v1, v10, v3, v1 -; GISEL-NEXT: v_or3_b32 v1, v1, v2, 0 +; GISEL-NEXT: v_and_b32_e32 v3, 0xfffff, v10 +; GISEL-NEXT: v_or3_b32 v1, v3, v1, v2 ; GISEL-NEXT: .LBB2_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1081,8 +1080,8 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 ; GISEL-NEXT: v_lshl_add_u32 v0, v7, 20, v0 -; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 -; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0xfffff +; GISEL-NEXT: v_and_or_b32 v5, v9, v1, v0 ; GISEL-NEXT: .LBB3_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: v_mov_b32_e32 v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index 8036e32f90eb0..4c0774194d78f 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -116,7 +116,7 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; GFX9-GISEL-NEXT: s_brev_b32 s4, 1 ; GFX9-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s4, v4 @@ -142,7 +142,7 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 ; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v4 @@ -172,7 +172,7 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX11-GISEL-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 @@ -374,7 +374,7 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; GFX9-GISEL-NEXT: s_brev_b32 s4, 1 ; GFX9-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s4, v4 @@ -414,7 +414,7 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 ; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v4 @@ -456,7 +456,7 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX11-GISEL-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 @@ -665,7 +665,7 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; GFX9-GISEL-NEXT: s_brev_b32 s4, 1 ; GFX9-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s4, v4 @@ -705,7 +705,7 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 ; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v4 @@ -747,7 +747,7 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX11-GISEL-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-GISEL-NEXT: v_and_or_b32 v0, v0, 0, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 79c4cda2eeaef..88ac211453daa 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -246,12 +246,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 ; GISEL-VI-NEXT: s_max_i32 s3, s3, 0 ; GISEL-VI-NEXT: s_max_i32 s2, s2, 0 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 ; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff ; GISEL-VI-NEXT: s_min_i32 s2, s2, 0xff -; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 -; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 ; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 ; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -269,8 +265,6 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-GFX9-NEXT: s_max_i32 s2, s2, 0 ; GISEL-GFX9-NEXT: s_max_i32 s3, s3, 0 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-GFX9-NEXT: s_min_i32 s2, s2, 0xff ; GISEL-GFX9-NEXT: s_min_i32 s3, s3, 0xff ; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 @@ -287,8 +281,6 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0 ; GISEL-GFX11-NEXT: s_max_i32 s3, s3, 0 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-GFX11-NEXT: s_min_i32 s2, s2, 0xff ; GISEL-GFX11-NEXT: s_min_i32 s3, s3, 0xff ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -306,8 +298,6 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-GFX12-NEXT: s_max_i32 s2, s2, 0 ; GISEL-GFX12-NEXT: s_max_i32 s3, s3, 0 -; GISEL-GFX12-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-GFX12-NEXT: s_min_i32 s2, s2, 0xff ; GISEL-GFX12-NEXT: s_min_i32 s3, s3, 0xff ; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -745,13 +735,11 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; GISEL-VI-NEXT: s_sext_i32_i16 s3, s2 ; GISEL-VI-NEXT: s_bfe_i32 s2, s2, 0x100010 ; GISEL-VI-NEXT: s_max_i32 s2, s2, 0 -; GISEL-VI-NEXT: s_max_i32 s3, s3, 0 ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-VI-NEXT: s_min_i32 s2, s2, 0xff -; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff +; GISEL-VI-NEXT: s_max_i32 s3, s3, 0 ; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 -; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 +; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff ; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16 ; GISEL-VI-NEXT: s_or_b32 s2, s3, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 >From 39ae19d11123de1b9f35f525dc9d0299eedca60a Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Wed, 28 May 2025 11:12:27 +0200 Subject: [PATCH 2/2] Remove duplicate combines --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index df867aaa204b1..759d0b077bf21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -205,10 +205,9 @@ def AMDGPUPostLegalizerCombiner: GICombiner< def AMDGPURegBankCombiner : GICombiner< "AMDGPURegBankCombinerImpl", - [unmerge_merge, unmerge_cst, unmerge_undef, - zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, - fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, - identity_combines, redundant_and, constant_fold_cast_op, + [unmerge_merge, unmerge_cst, unmerge_undef, int_minmax_to_med3, + ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, + fmed3_intrinsic_to_clamp, identity_combines, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, known_bits_simplifications]> { _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits