llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Previously this combine would undo AMDGPU's new custom legalization of wide vector shuffles into 2 element pieces. The comment also states that this combine is only done before legalization, but the case with a build_vector source was unconditional. We probably don't want to do this if the multiple uses are full scalarization of the vector, but this seems to work well enough. Scalarizing extracts should have folded out pre-legalize. --- Patch is 393.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123712.diff 10 Files Affected: - (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+5) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll (+114-166) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll (+129-143) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll (+343-466) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll (+114-166) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll (+129-143) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll (+343-466) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll (+114-166) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll (+129-143) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll (+343-466) ``````````diff diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 671a14e6250fc6..eaa08b0ef32af2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23172,6 +23172,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { + // TODO: Check if shuffle mask is legal? + if (LegalOperations && TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VecVT) && + !VecOp.hasOneUse()) + return SDValue(); + SDValue InOp = SVInVec.getOperand(OrigElt); if (InOp.getValueType() != ScalarVT) { assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index fafdac7ffac62f..54c5390ca76c62 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -176,8 +176,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -192,8 +191,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -566,16 +565,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -583,16 +581,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -675,31 +672,26 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -873,8 +865,7 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -887,8 +878,7 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -972,8 +962,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -989,8 +978,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1138,14 +1126,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1155,14 +1142,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1303,16 +1289,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1320,16 +1305,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1723,8 +1707,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1740,8 +1723,7 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1776,14 +1758,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,14 +1774,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2152,8 +2132,7 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2166,8 +2145,7 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2246,16 +2224,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2263,16 +2240,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2516,15 +2492,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2534,15 +2508,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2826,16 +2798,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2843,16 +2814,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/123712 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits