https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/128123
None >From da446d3f28f3b38d4e36a70ad9e1973f7ad9e707 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Tue, 28 May 2024 12:59:41 +0200 Subject: [PATCH] AMDGPU: Form v2f16 minimum3/maximum3 on gfx950 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 +- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 175 ++++++-------------- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 175 ++++++-------------- 4 files changed, 112 insertions(+), 245 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0b13a53a0c989..6ed09253c51e1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13515,7 +13515,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, case ISD::FMINIMUM: case ISD::FMAXIMUM: return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) || - (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()); + (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) || + (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16()); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index d5c6e8af109f4..85c047167f1e1 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -145,8 +145,8 @@ def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>; } // End SubtargetPredicate = HasVOP3PInsts let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in { -defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>; -defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>; +defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>; +defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>; } // TODO: Make sure we're doing the right thing with denormals. Note diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index f228824ff750e..2a372dffce650 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1269,9 +1269,7 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) { ; GFX950-LABEL: v_fmaximum3_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1306,9 +1304,7 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) { ; GFX950-LABEL: v_fmaximum3_f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %c, half %max0) @@ -1346,10 +1342,9 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; ; GFX950-LABEL: s_fmaximum3_f16: ; GFX950: ; %bb.0: -; GFX950-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s2, s2 +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, s0, v0, v1 ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX950-NEXT: s_nop 0 @@ -1392,9 +1387,7 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b) @@ -1431,9 +1424,7 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs) @@ -1470,9 +1461,7 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1511,9 +1500,7 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1554,9 +1541,7 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %b.fneg = fneg half %b @@ -1597,9 +1582,7 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1641,9 +1624,7 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b) @@ -1680,9 +1661,7 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg) @@ -1719,9 +1698,7 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1758,9 +1735,7 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_movk_i32 s0, 0x4800 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 8.0, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1795,9 +1770,8 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) { ; GFX950-LABEL: v_fmaximum3_f16__const2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_movk_i32 s0, 0x4800 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, s0 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half 8.0) @@ -1832,9 +1806,7 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) { ; GFX950-LABEL: v_fmaximum3_f16_inlineimm0: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 4.0, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1869,9 +1841,7 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) { ; GFX950-LABEL: v_fmaximum3_f16__inlineimm: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, 4.0 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half 4.0) @@ -1909,9 +1879,8 @@ define half @v_fmaximum3_f16_const1_const2(half %a) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_movk_i32 s0, 0x4800 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 -; GFX950-NEXT: s_movk_i32 s0, 0x4c00 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, 0x4c00 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half 8.0) %max1 = call half @llvm.maximum.f16(half %max0, half 16.0) @@ -1959,9 +1928,7 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX950-LABEL: v_fmaximum3_v2f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0) @@ -2009,9 +1976,7 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX950-LABEL: v_fmaximum3_v2f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -2067,9 +2032,7 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) @@ -2120,9 +2083,7 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX950-LABEL: v_fmaximum3_v2f16__fneg_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b @@ -2173,9 +2134,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -2223,9 +2182,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, 4.0 op_sel_hi:[1,1,0] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>) @@ -2287,10 +2244,8 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX950-LABEL: v_fmaximum3_v3f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v1 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0) @@ -2352,10 +2307,8 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX950-LABEL: v_fmaximum3_v3f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -2428,16 +2381,14 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX950-LABEL: v_fmaximum3_v3f16__fabs_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 ; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) @@ -2502,10 +2453,8 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX950-LABEL: v_fmaximum3_v3f16__fneg_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b @@ -2567,10 +2516,8 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, 2.0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -2632,10 +2579,8 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 4.0, 4.0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, 4.0 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>) @@ -2703,10 +2648,8 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX950-LABEL: v_fmaximum3_v4f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v1 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0) @@ -2774,10 +2717,8 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX950-LABEL: v_fmaximum3_v4f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2856,16 +2797,14 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX950-LABEL: v_fmaximum3_v4f16__fabs_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 ; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) @@ -2936,10 +2875,8 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b @@ -3008,10 +2945,8 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, v3 op_sel_hi:[1,0,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -3079,10 +3014,8 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 4.0, 4.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, 4.0 op_sel_hi:[1,1,0] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>) diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 8ba73071d9adb..34d7e5acb7896 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1269,9 +1269,7 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) { ; GFX950-LABEL: v_fminimum3_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1306,9 +1304,7 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) { ; GFX950-LABEL: v_fminimum3_f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %c, half %max0) @@ -1346,10 +1342,9 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; ; GFX950-LABEL: s_fminimum3_f16: ; GFX950: ; %bb.0: -; GFX950-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s2, s2 +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, s0, v0, v1 ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX950-NEXT: s_nop 0 @@ -1392,9 +1387,7 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b) @@ -1431,9 +1424,7 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs) @@ -1470,9 +1461,7 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1511,9 +1500,7 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1554,9 +1541,7 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %b.fneg = fneg half %b @@ -1597,9 +1582,7 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1641,9 +1624,7 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b) @@ -1680,9 +1661,7 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg) @@ -1719,9 +1698,7 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1758,9 +1735,7 @@ define half @v_fminimum3_f16_const0(half %b, half %c) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_movk_i32 s0, 0x4800 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 8.0, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1795,9 +1770,8 @@ define half @v_fminimum3_f16__const2(half %a, half %b) { ; GFX950-LABEL: v_fminimum3_f16__const2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_movk_i32 s0, 0x4800 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, s0 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half 8.0) @@ -1832,9 +1806,7 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) { ; GFX950-LABEL: v_fminimum3_f16_inlineimm0: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 4.0, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1869,9 +1841,7 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) { ; GFX950-LABEL: v_fminimum3_f16__inlineimm: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, 4.0 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half 4.0) @@ -1909,9 +1879,8 @@ define half @v_fminimum3_f16_const1_const2(half %a) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_movk_i32 s0, 0x4800 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 -; GFX950-NEXT: s_movk_i32 s0, 0x4c00 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, 0x4c00 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half 8.0) %max1 = call half @llvm.minimum.f16(half %max0, half 16.0) @@ -1959,9 +1928,7 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX950-LABEL: v_fminimum3_v2f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0) @@ -2009,9 +1976,7 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX950-LABEL: v_fminimum3_v2f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -2067,9 +2032,7 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) @@ -2120,9 +2083,7 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX950-LABEL: v_fminimum3_v2f16__fneg_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b @@ -2173,9 +2134,7 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX950-LABEL: v_fminimum3_v2f16__inlineimm1: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -2223,9 +2182,7 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX950-LABEL: v_fminimum3_v2f16__inlineimm2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, 4.0 op_sel_hi:[1,1,0] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>) @@ -2287,10 +2244,8 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX950-LABEL: v_fminimum3_v3f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v1 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0) @@ -2352,10 +2307,8 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX950-LABEL: v_fminimum3_v3f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -2428,16 +2381,14 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX950-LABEL: v_fminimum3_v3f16__fabs_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 ; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) @@ -2502,10 +2453,8 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX950-LABEL: v_fminimum3_v3f16__fneg_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b @@ -2567,10 +2516,8 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX950-LABEL: v_fminimum3_v3f16__inlineimm1: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, 2.0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -2632,10 +2579,8 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX950-LABEL: v_fminimum3_v3f16__inlineimm2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 4.0, 4.0 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, 4.0 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>) @@ -2703,10 +2648,8 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX950-LABEL: v_fminimum3_v4f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v1 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0) @@ -2774,10 +2717,8 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX950-LABEL: v_fminimum3_v4f16_commute: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2856,16 +2797,14 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX950-LABEL: v_fminimum3_v4f16__fabs_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 ; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) @@ -2936,10 +2875,8 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX950-LABEL: v_fminimum3_v4f16__fneg_all: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b @@ -3008,10 +2945,8 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, v3 op_sel_hi:[1,0,1] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -3079,10 +3014,8 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm2: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 4.0, 4.0 op_sel_hi:[1,0,0] -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, 4.0 op_sel_hi:[1,1,0] ; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits