llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Fix ABI on old subtargets to match new subtargets, packing 16-bit element subvectors into 32-bit registers. Previously this would be scalarized and promoted to i32/float. Note this only changes the vector cases. Scalar i16/half are still promoted to i32/float for now. I've unsuccessfully tried to make that switch in the past, so leave that for later. This will help with removal of softPromoteHalfType. --- Patch is 21.22 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175781.diff 157 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+8-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll (+17-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll (+74-48) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll (+138-160) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+130-115) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll (+2-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll (+4-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll (-8) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll (+29-21) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll (+18-7) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll (+430-359) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll (+89-53) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll (+89-89) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+183-142) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+193-151) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (+23-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll (+5-14) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+102-91) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll 
(+138-160) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+340-290) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll (+77-69) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll (-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll (+108-78) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+336-287) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll (+8-2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+190-146) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+158-114) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll (+13-28) - (modified) llvm/test/CodeGen/AMDGPU/abs_i16.ll (+480-518) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+44717-48398) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+2789-2023) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll (+923-650) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll (+1864-1301) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll (+1296-912) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+6057-4485) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll (+1674-1190) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+4865-3712) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+578-487) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll (+2057-1455) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll (+3897-2742) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll (+4677-3284) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+267-212) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+15986-12539) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+6571-7076) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+7768-8457) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+1321-973) - (modified) 
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+8787-9729) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+10305-11655) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+11359-13002) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+12473-14142) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+13870-15638) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+1526-1148) - (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+4623-9054) - (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+15-34) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+565-437) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+255-203) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+255-203) - (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+11-31) - (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+56-33) - (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+5-1) - (modified) llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll (+342-226) - (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+14-17) - (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+50-42) - (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+259-248) - (modified) llvm/test/CodeGen/AMDGPU/extract-subvector.ll (+4-5) - (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+456-473) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+1236-2026) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1113-1096) - (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+48-42) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+447-351) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+303-235) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+303-235) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+299-231) - (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll 
(+136-132) - (modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (+177-147) - (modified) llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll (+7-7) - (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+31-19) - (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+136-132) - (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (+177-147) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll (+99-93) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.ll (+10-15) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+20-41) - (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+42-70) - (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+11-24) - (modified) llvm/test/CodeGen/AMDGPU/fpow.ll (+52-56) - (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+68-56) - (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+40-36) - (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+150-509) - (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+117-330) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+1055-851) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+609-489) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+609-489) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+601-481) - (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+38-38) - (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+707-584) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll (+16-57) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll (+3-17) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll (+3-31) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll (+8-69) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll (+16-57) - (modified) 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll (+16-57) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+106-110) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+155-153) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+97-193) - (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+61-49) - (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll (+9-18) - (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+22-27) - (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+140-92) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+166-188) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+166-188) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+147-221) - (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+396-348) - (modified) llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll (+11-8) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+283-221) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+296-234) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+296-234) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+283-221) - (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+26-14) - (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+372-340) - (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+99-75) - (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+449-551) - (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+912-774) - (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+449-551) - (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+912-774) - (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+85-63) - (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+80-60) - (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+106-72) - (modified) llvm/test/CodeGen/AMDGPU/saddsat.ll (+43-45) - (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll (+588-521) - (modified) llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll (+242-194) 
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+528-921) - (modified) llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll (+4-6) - (modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+24-7) - (modified) llvm/test/CodeGen/AMDGPU/ssubsat.ll (+43-45) - (modified) llvm/test/CodeGen/AMDGPU/strict_fpext.ll (+14-29) - (modified) llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll (+13-4) - (modified) llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll (+79-173) - (modified) llvm/test/CodeGen/AMDGPU/trunc-combine.ll (+21-62) - (modified) llvm/test/CodeGen/AMDGPU/uaddsat.ll (+31-32) - (modified) llvm/test/CodeGen/AMDGPU/usubsat.ll (+37-38) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll (+68-40) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll (+66-77) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll (+96-97) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll (+96-97) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll (+88-105) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll (+96-97) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll (+88-105) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll (+96-97) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll (+66-43) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll (+64-67) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll (+107-109) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll (+107-109) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+80-80) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+86-94) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll (+68-69) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ed5988ee6efc3..49f5d514071e2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1113,7 +1113,7 @@ MVT 
SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, if (Size == 16) { if (Subtarget->has16BitInsts()) return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2); - return VT.isInteger() ? MVT::i32 : MVT::f32; + return ScalarVT == MVT::f32 ? MVT::f32 : MVT::i32; } if (Size < 16) @@ -1139,7 +1139,7 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, unsigned Size = ScalarVT.getSizeInBits(); // FIXME: Should probably promote 8-bit vectors to i16. - if (Size == 16 && Subtarget->has16BitInsts()) + if (Size == 16) return (NumElts + 1) / 2; if (Size <= 32) @@ -1163,11 +1163,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( // FIXME: We should fix the ABI to be the same on targets without 16-bit // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. - if (Size == 16 && Subtarget->has16BitInsts()) { - RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2); - IntermediateVT = RegisterVT; + if (Size == 16) { + MVT SimpleIntermediateVT = + MVT::getVectorVT(ScalarVT.getSimpleVT(), ElementCount::getFixed(2)); + IntermediateVT = SimpleIntermediateVT; + RegisterVT = Subtarget->has16BitInsts() ? 
SimpleIntermediateVT : MVT::i32; NumIntermediates = (NumElts + 1) / 2; - return NumIntermediates; + return (NumElts + 1) / 2; } if (Size == 32) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll index d6f1b142b36e0..5c60eb696f6b2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll @@ -200,10 +200,15 @@ define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: s_add_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s16, s16, s18 -; GFX7-NEXT: s_add_i32 s17, s17, s19 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: s_lshr_b32 s4, s16, 16 +; GFX7-NEXT: s_lshr_b32 s5, s17, 16 +; GFX7-NEXT: s_add_i32 s4, s4, s5 +; GFX7-NEXT: s_add_i32 s16, s16, s17 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: s_and_b32 s5, s16, 0xffff +; GFX7-NEXT: s_lshl_b32 s4, s4, 16 +; GFX7-NEXT: s_or_b32 s4, s5, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_add_v2i16: @@ -278,8 +283,14 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX7-LABEL: v_add_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index 814acc3be1fc0..244d006844a09 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll 
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -9,8 +9,14 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX7-LABEL: v_add_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16: @@ -40,13 +46,15 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) { ; GFX7-LABEL: v_add_v2i16_fneg_lhs: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_lhs: @@ -79,13 +87,15 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) { ; GFX7-LABEL: v_add_v2i16_fneg_rhs: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_rhs: @@ -118,18 +128,16 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) { ; GFX7-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: @@ -165,8 +173,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_and_b32_e32 
v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -197,8 +210,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo: @@ -230,8 +248,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: @@ -262,6 +285,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_sub_i32 s1, s1, 64 ; GFX7-NEXT: s_sub_i32 s0, s0, 64 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff @@ -304,6 +328,7 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { define amdgpu_ps i32 
@s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_add_i32 s1, s1, 4 ; GFX7-NEXT: s_sub_i32 s0, s0, 64 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff @@ -346,6 +371,7 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_sub_i32 s1, s1, 64 ; GFX7-NEXT: s_add_i32 s0, s0, 4 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff @@ -388,9 +414,11 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: s_add_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_add_i32 s1, s1, s3 -; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -433,14 +461,12 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: s_add_v2i16_fneg_lhs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000 -; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_add_i32 s1, s1, s3 -; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; 
GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -488,14 +514,12 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) { ; GFX7-LABEL: s_add_v2i16_fneg_rhs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshl_b32 s3, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, 0xffff -; GFX7-NEXT: s_or_b32 s2, s3, s2 -; GFX7-NEXT: s_xor_b32 s2, s2, 0x80008000 -; GFX7-NEXT: s_lshr_b32 s3, s2, 16 -; GFX7-NEXT: s_add_i32 s1, s1, s3 -; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -543,12 +567,6 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) { ; GFX7-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_or_b32 s0, s1, s0 -; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, 0xffff -; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX7-NEXT: s_lshr_b32 s2, s0, 16 @@ -609,7 +627,11 @@ define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { ; GFX7-LABEL: add_inline_imm_neg1_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_inline_imm_neg1_0: @@ 
-640,7 +662,11 @@ define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { ; GFX7-LABEL: add_inline_imm_1_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_inline_imm_1_0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 22b63a7de5f89..29a688ccf280d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -513,14 +513,8 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) { define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s0, s3, -1 +; GFX6-NEXT: s_and_b32 s0, s2, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16: @@ -546,14 +540,8 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1 define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s1, s0 +; GFX6-NEXT: s_xor_b32 s0, s3, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: ; return to shader part 
epilog ; ; GFX9-LABEL: s_andn2_v2i16_commute: @@ -579,14 +567,8 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s1, s3, -1 +; GFX6-NEXT: s_and_b32 s0, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_use: @@ -619,18 +601,9 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s2 +; GFX6-NEXT: s_xor_b32 s1, s4, -1 +; GFX6-NEXT: s_and_b32 s0, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s3, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use: @@ -662,26 +635,12 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg } define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { -; GFX6-LABEL: v_andn2_v2i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_andn2_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_andn2_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_andn2_v2i16: ; GFX10PLUS: ; %bb.0: @@ -698,19 +657,19 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_mov_b32 s0, -1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_or_b32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s7 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_xor_b... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/175781 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
