https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/146076
>From 8710de705f09d90f166f82c1733620b2c8581306 Mon Sep 17 00:00:00 2001 From: Fabian Ritter <fabian.rit...@amd.com> Date: Fri, 27 Jun 2025 05:38:52 -0400 Subject: [PATCH 1/3] [AMDGPU][SDAG] Enable ISD::PTRADD for 64-bit AS by default Also removes the command line option to control this feature. There seem to be mainly two kinds of test changes: - Some operands of addition instructions are swapped; that is to be expected since PTRADD is not commutative. - Improvements in code generation, probably because the legacy lowering enabled some transformations that were sometimes harmful. For SWDEV-516125. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +- .../identical-subrange-spill-infloop.ll | 352 +++++++++++------- .../AMDGPU/infer-addrspace-flat-atomic.ll | 14 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 8 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 4 +- .../AMDGPU/lower-module-lds-via-table.ll | 16 +- .../match-perm-extract-vector-elt-bug.ll | 22 +- llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 16 +- .../AMDGPU/preload-implicit-kernargs.ll | 6 +- .../AMDGPU/promote-constOffset-to-imm.ll | 8 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll | 7 +- .../AMDGPU/ptradd-sdag-optimizations.ll | 94 ++--- .../AMDGPU/ptradd-sdag-undef-poison.ll | 6 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 27 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 29 +- 15 files changed, 310 insertions(+), 309 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a1af50dac7e54..05ab745171f6d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -63,14 +63,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG. 
-static cl::opt<bool> UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -11252,7 +11244,7 @@ static bool isNoUnsignedWrap(SDValue Addr) { bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 2c03113e8af47..805cdd37d6e70 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,96 +6,150 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v5, s30, 0 -; CHECK-NEXT: v_writelane_b32 v5, s31, 1 -; CHECK-NEXT: v_writelane_b32 v5, s36, 2 -; CHECK-NEXT: v_writelane_b32 v5, s37, 3 -; CHECK-NEXT: v_writelane_b32 v5, s38, 4 -; CHECK-NEXT: v_writelane_b32 v5, s39, 5 -; CHECK-NEXT: v_writelane_b32 v5, s48, 6 -; CHECK-NEXT: v_writelane_b32 v5, s49, 7 -; CHECK-NEXT: v_writelane_b32 v5, s50, 8 -; CHECK-NEXT: v_writelane_b32 v5, s51, 9 -; CHECK-NEXT: v_writelane_b32 v5, s52, 10 -; CHECK-NEXT: v_writelane_b32 v5, s53, 11 -; CHECK-NEXT: v_writelane_b32 v5, s54, 12 -; CHECK-NEXT: v_writelane_b32 v5, s55, 13 -; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v5, s64, 14 -; CHECK-NEXT: s_movk_i32 s4, 0xf0 -; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v5, s65, 15 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_writelane_b32 v5, s66, 16 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s67, 17 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x130 -; CHECK-NEXT: s_mov_b32 s7, s24 -; CHECK-NEXT: v_writelane_b32 v5, s68, 18 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s69, 19 -; CHECK-NEXT: v_writelane_b32 v5, s70, 20 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_writelane_b32 v6, s36, 2 +; CHECK-NEXT: v_writelane_b32 v6, s37, 3 +; CHECK-NEXT: v_writelane_b32 v6, s38, 4 +; CHECK-NEXT: v_writelane_b32 v6, s39, 5 +; CHECK-NEXT: v_writelane_b32 v6, s48, 6 +; CHECK-NEXT: v_writelane_b32 v6, s49, 7 +; CHECK-NEXT: v_writelane_b32 v6, s50, 8 +; CHECK-NEXT: v_writelane_b32 v6, s51, 9 +; CHECK-NEXT: v_writelane_b32 v6, s52, 10 +; CHECK-NEXT: v_writelane_b32 v6, s53, 11 +; CHECK-NEXT: v_writelane_b32 v6, s54, 12 +; CHECK-NEXT: v_writelane_b32 v6, s55, 13 +; CHECK-NEXT: v_writelane_b32 v6, s64, 14 +; CHECK-NEXT: v_writelane_b32 v6, s65, 15 +; 
CHECK-NEXT: v_writelane_b32 v6, s66, 16 +; CHECK-NEXT: v_writelane_b32 v6, s67, 17 +; CHECK-NEXT: v_writelane_b32 v6, s68, 18 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: v_writelane_b32 v6, s69, 19 ; CHECK-NEXT: s_mov_b32 s68, 0 -; CHECK-NEXT: v_writelane_b32 v5, s71, 21 +; CHECK-NEXT: s_mov_b32 s69, s4 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[24:31], s[68:69], 0x30 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0xf0 +; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9 +; CHECK-NEXT: v_writelane_b32 v6, s70, 20 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_writelane_b32 v6, s71, 21 +; CHECK-NEXT: v_writelane_b32 v7, s36, 0 +; CHECK-NEXT: v_writelane_b32 v7, s37, 1 +; CHECK-NEXT: v_writelane_b32 v7, s38, 2 +; CHECK-NEXT: v_writelane_b32 v7, s39, 3 +; CHECK-NEXT: v_writelane_b32 v7, s40, 4 +; CHECK-NEXT: v_writelane_b32 v7, s41, 5 +; CHECK-NEXT: v_writelane_b32 v7, s42, 6 +; CHECK-NEXT: v_writelane_b32 v7, s43, 7 +; CHECK-NEXT: v_writelane_b32 v7, s44, 8 +; CHECK-NEXT: v_writelane_b32 v7, s45, 9 +; CHECK-NEXT: v_writelane_b32 v7, s46, 10 +; CHECK-NEXT: v_writelane_b32 v7, s47, 11 +; CHECK-NEXT: v_writelane_b32 v7, s48, 12 +; CHECK-NEXT: v_writelane_b32 v7, s49, 13 +; CHECK-NEXT: v_writelane_b32 v7, s50, 14 +; CHECK-NEXT: v_writelane_b32 v7, s51, 15 +; CHECK-NEXT: v_writelane_b32 v7, s8, 16 +; CHECK-NEXT: v_writelane_b32 v7, s9, 17 +; CHECK-NEXT: v_writelane_b32 v7, s10, 18 +; CHECK-NEXT: v_writelane_b32 v7, s11, 19 +; CHECK-NEXT: v_writelane_b32 v7, s12, 20 +; CHECK-NEXT: v_writelane_b32 v7, s13, 21 +; CHECK-NEXT: v_writelane_b32 v7, s14, 22 +; CHECK-NEXT: v_writelane_b32 v7, s15, 23 +; CHECK-NEXT: v_writelane_b32 v7, s16, 24 +; CHECK-NEXT: v_writelane_b32 v7, s17, 25 +; CHECK-NEXT: v_writelane_b32 v7, s18, 26 +; CHECK-NEXT: v_writelane_b32 v7, s19, 27 +; CHECK-NEXT: v_writelane_b32 v7, s20, 28 +; CHECK-NEXT: v_writelane_b32 v7, s21, 29 +; CHECK-NEXT: v_writelane_b32 v7, s22, 30 +; CHECK-NEXT: v_writelane_b32 v7, s23, 31 +; CHECK-NEXT: v_readlane_b32 s4, v7, 0 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0x1f0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s5, v7, 1 +; CHECK-NEXT: v_readlane_b32 s6, v7, 2 +; CHECK-NEXT: v_readlane_b32 s7, v7, 3 +; CHECK-NEXT: v_readlane_b32 s8, v7, 4 +; CHECK-NEXT: v_readlane_b32 s9, v7, 5 +; CHECK-NEXT: v_readlane_b32 s10, v7, 6 +; CHECK-NEXT: v_readlane_b32 s11, v7, 7 +; CHECK-NEXT: v_readlane_b32 s12, v7, 8 +; CHECK-NEXT: v_readlane_b32 s13, v7, 9 +; CHECK-NEXT: v_readlane_b32 s14, v7, 10 +; CHECK-NEXT: v_readlane_b32 s15, v7, 11 +; CHECK-NEXT: v_readlane_b32 s16, v7, 12 +; CHECK-NEXT: v_readlane_b32 s17, v7, 13 +; CHECK-NEXT: v_readlane_b32 s18, v7, 14 +; CHECK-NEXT: v_readlane_b32 s19, v7, 15 +; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[20:21], 1, v5 +; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s4, v7, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: s_mov_b32 s6, 48 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) 
-; CHECK-NEXT: v_writelane_b32 v6, s36, 0 -; CHECK-NEXT: v_writelane_b32 v6, s37, 1 -; CHECK-NEXT: v_writelane_b32 v6, s38, 2 -; CHECK-NEXT: v_writelane_b32 v6, s39, 3 -; CHECK-NEXT: v_writelane_b32 v6, s40, 4 -; CHECK-NEXT: v_writelane_b32 v6, s41, 5 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[68:71] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v6, s42, 6 -; CHECK-NEXT: v_writelane_b32 v6, s43, 7 -; CHECK-NEXT: v_writelane_b32 v6, s44, 8 -; CHECK-NEXT: v_writelane_b32 v6, s45, 9 -; CHECK-NEXT: v_writelane_b32 v6, s46, 10 -; CHECK-NEXT: v_writelane_b32 v6, s47, 11 -; CHECK-NEXT: v_writelane_b32 v6, s48, 12 -; CHECK-NEXT: v_writelane_b32 v6, s49, 13 -; CHECK-NEXT: v_writelane_b32 v6, s50, 14 -; CHECK-NEXT: s_movk_i32 s56, 0x1f0 -; CHECK-NEXT: s_movk_i32 s72, 0x2f0 -; CHECK-NEXT: s_mov_b32 s57, s24 -; CHECK-NEXT: s_mov_b32 s73, s24 -; CHECK-NEXT: v_writelane_b32 v6, s51, 15 -; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0 -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CHECK-NEXT: v_readlane_b32 s5, v7, 17 +; CHECK-NEXT: v_readlane_b32 s6, v7, 18 +; CHECK-NEXT: v_readlane_b32 s7, v7, 19 +; CHECK-NEXT: v_readlane_b32 s8, v7, 20 +; CHECK-NEXT: v_readlane_b32 s9, v7, 21 +; CHECK-NEXT: v_readlane_b32 s10, v7, 22 +; CHECK-NEXT: v_readlane_b32 s11, v7, 23 +; CHECK-NEXT: v_readlane_b32 s12, v7, 24 +; CHECK-NEXT: v_readlane_b32 s13, v7, 25 +; CHECK-NEXT: v_readlane_b32 s14, v7, 26 +; CHECK-NEXT: v_readlane_b32 s15, v7, 27 +; CHECK-NEXT: v_readlane_b32 s16, v7, 28 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[4:11], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s17, v7, 29 +; CHECK-NEXT: v_readlane_b32 s18, v7, 30 +; CHECK-NEXT: v_readlane_b32 s19, v7, 31 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[20:21] +; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_readlane_b32 s4, v7, 0 +; CHECK-NEXT: v_readlane_b32 s12, v7, 8 +; CHECK-NEXT: v_readlane_b32 s13, v7, 9 +; CHECK-NEXT: v_readlane_b32 s14, v7, 10 +; CHECK-NEXT: v_readlane_b32 s15, v7, 11 +; CHECK-NEXT: v_readlane_b32 s16, v7, 12 +; CHECK-NEXT: v_readlane_b32 s17, v7, 13 +; CHECK-NEXT: v_readlane_b32 s18, v7, 14 +; CHECK-NEXT: v_readlane_b32 s19, v7, 15 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: v_readlane_b32 s5, v7, 1 +; CHECK-NEXT: v_readlane_b32 s6, v7, 2 +; CHECK-NEXT: v_readlane_b32 s7, v7, 3 +; CHECK-NEXT: v_readlane_b32 s8, v7, 4 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[68:71] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_readlane_b32 s9, v7, 5 +; CHECK-NEXT: v_readlane_b32 s10, v7, 6 +; CHECK-NEXT: v_readlane_b32 s11, v7, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_mov_b32 s69, s68 -; CHECK-NEXT: s_mov_b32 s70, s68 -; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[60:67], s[28:31] dmask:0x1 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: 
image_sample_lz v1, v[1:2], s[44:51], s[68:71] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 @@ -103,50 +157,66 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[22:23] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[20:21] ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s16, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v2, s16 -; CHECK-NEXT: v_mov_b32_e32 v3, s17 +; CHECK-NEXT: v_mov_b32_e32 v0, s16 +; CHECK-NEXT: s_mov_b64 s[10:11], s[54:55] +; CHECK-NEXT: s_mov_b64 s[12:13], s[56:57] +; CHECK-NEXT: s_mov_b64 s[14:15], s[58:59] +; CHECK-NEXT: v_readlane_b32 s44, v7, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[16:19] dmask:0x1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[38:39] -; CHECK-NEXT: s_mov_b64 s[12:13], s[40:41] -; CHECK-NEXT: s_mov_b64 s[14:15], s[42:43] -; CHECK-NEXT: v_readlane_b32 s36, v6, 0 -; CHECK-NEXT: v_readlane_b32 s44, v6, 8 -; CHECK-NEXT: v_readlane_b32 s45, v6, 9 -; CHECK-NEXT: v_readlane_b32 s46, v6, 10 -; CHECK-NEXT: v_readlane_b32 s47, v6, 11 -; CHECK-NEXT: v_readlane_b32 s48, v6, 12 -; CHECK-NEXT: v_readlane_b32 s49, v6, 13 -; CHECK-NEXT: v_readlane_b32 s50, v6, 14 -; CHECK-NEXT: v_readlane_b32 s51, v6, 15 -; CHECK-NEXT: v_readlane_b32 s37, v6, 1 -; CHECK-NEXT: v_readlane_b32 s38, v6, 2 -; CHECK-NEXT: v_readlane_b32 s39, v6, 3 -; CHECK-NEXT: v_readlane_b32 s40, v6, 4 -; CHECK-NEXT: v_readlane_b32 s41, v6, 5 -; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[24:27] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s42, v6, 6 -; CHECK-NEXT: v_readlane_b32 s43, v6, 7 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_mov_b64 s[42:43], s[14:15] -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_mov_b64 s[40:41], s[12:13] -; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: v_readlane_b32 s45, v7, 1 +; CHECK-NEXT: v_readlane_b32 s46, v7, 2 +; CHECK-NEXT: v_readlane_b32 s47, v7, 3 +; CHECK-NEXT: v_readlane_b32 s48, v7, 4 +; CHECK-NEXT: v_readlane_b32 s49, v7, 5 +; CHECK-NEXT: v_readlane_b32 s50, v7, 6 +; CHECK-NEXT: v_readlane_b32 s51, v7, 7 +; CHECK-NEXT: v_readlane_b32 s52, v7, 8 +; CHECK-NEXT: v_readlane_b32 s53, v7, 9 +; CHECK-NEXT: v_readlane_b32 s54, v7, 10 +; CHECK-NEXT: v_readlane_b32 s55, v7, 11 +; CHECK-NEXT: v_readlane_b32 s56, v7, 12 +; CHECK-NEXT: v_readlane_b32 s57, v7, 13 +; CHECK-NEXT: v_readlane_b32 s58, v7, 14 +; CHECK-NEXT: v_readlane_b32 s59, v7, 15 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s44, v7, 16 +; CHECK-NEXT: v_readlane_b32 s52, v7, 24 +; CHECK-NEXT: v_readlane_b32 s53, v7, 25 +; CHECK-NEXT: v_readlane_b32 s54, v7, 26 +; CHECK-NEXT: v_readlane_b32 s55, v7, 27 +; CHECK-NEXT: v_readlane_b32 s56, v7, 28 +; CHECK-NEXT: v_readlane_b32 s57, v7, 29 +; CHECK-NEXT: v_readlane_b32 s58, v7, 30 +; CHECK-NEXT: 
v_readlane_b32 s59, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_readlane_b32 s45, v7, 17 +; CHECK-NEXT: v_readlane_b32 s46, v7, 18 +; CHECK-NEXT: v_readlane_b32 s47, v7, 19 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 +; CHECK-NEXT: s_mov_b64 s[58:59], s[14:15] +; CHECK-NEXT: v_readlane_b32 s48, v7, 20 +; CHECK-NEXT: v_readlane_b32 s49, v7, 21 +; CHECK-NEXT: v_readlane_b32 s50, v7, 22 +; CHECK-NEXT: v_readlane_b32 s51, v7, 23 +; CHECK-NEXT: s_mov_b64 s[56:57], s[12:13] +; CHECK-NEXT: s_mov_b64 s[54:55], s[10:11] +; CHECK-NEXT: s_mov_b64 s[52:53], s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[16:19], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 @@ -154,17 +224,17 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader -; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s12, s8 -; CHECK-NEXT: s_mov_b32 s13, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s12 -; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s13 +; CHECK-NEXT: s_mov_b32 s16, 0 +; CHECK-NEXT: s_mov_b32 s20, s16 +; CHECK-NEXT: s_mov_b32 s21, s16 +; CHECK-NEXT: v_mov_b32_e32 v1, s20 +; CHECK-NEXT: s_mov_b32 s17, s16 +; CHECK-NEXT: s_mov_b32 s18, s16 +; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_mov_b32_e32 v2, s21 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[52:59], s[16:19] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 @@ -180,32 +250,32 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_readlane_b32 s71, v5, 21 -; CHECK-NEXT: v_readlane_b32 s70, v5, 20 -; CHECK-NEXT: v_readlane_b32 s69, v5, 19 -; CHECK-NEXT: v_readlane_b32 s68, v5, 18 +; CHECK-NEXT: v_readlane_b32 s71, v6, 21 +; CHECK-NEXT: v_readlane_b32 s70, v6, 20 +; CHECK-NEXT: v_readlane_b32 s69, v6, 19 +; CHECK-NEXT: v_readlane_b32 s68, v6, 18 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s67, v5, 17 -; CHECK-NEXT: v_readlane_b32 s66, v5, 16 -; CHECK-NEXT: v_readlane_b32 s65, v5, 15 -; CHECK-NEXT: v_readlane_b32 s64, v5, 14 -; CHECK-NEXT: v_readlane_b32 s55, v5, 13 -; CHECK-NEXT: v_readlane_b32 s54, v5, 12 -; CHECK-NEXT: v_readlane_b32 s53, v5, 11 -; CHECK-NEXT: v_readlane_b32 s52, v5, 10 -; CHECK-NEXT: v_readlane_b32 s51, v5, 9 -; CHECK-NEXT: v_readlane_b32 s50, v5, 8 -; CHECK-NEXT: v_readlane_b32 s49, v5, 7 -; CHECK-NEXT: v_readlane_b32 s48, v5, 6 -; CHECK-NEXT: v_readlane_b32 s39, v5, 5 -; CHECK-NEXT: v_readlane_b32 s38, v5, 4 -; CHECK-NEXT: v_readlane_b32 s37, v5, 3 -; CHECK-NEXT: v_readlane_b32 s36, v5, 2 -; CHECK-NEXT: v_readlane_b32 s31, v5, 1 -; CHECK-NEXT: v_readlane_b32 s30, v5, 0 +; CHECK-NEXT: v_readlane_b32 s67, v6, 17 +; CHECK-NEXT: v_readlane_b32 s66, v6, 16 +; CHECK-NEXT: v_readlane_b32 s65, v6, 15 +; CHECK-NEXT: v_readlane_b32 s64, v6, 14 +; 
CHECK-NEXT: v_readlane_b32 s55, v6, 13 +; CHECK-NEXT: v_readlane_b32 s54, v6, 12 +; CHECK-NEXT: v_readlane_b32 s53, v6, 11 +; CHECK-NEXT: v_readlane_b32 s52, v6, 10 +; CHECK-NEXT: v_readlane_b32 s51, v6, 9 +; CHECK-NEXT: v_readlane_b32 s50, v6, 8 +; CHECK-NEXT: v_readlane_b32 s49, v6, 7 +; CHECK-NEXT: v_readlane_b32 s48, v6, 6 +; CHECK-NEXT: v_readlane_b32 s39, v6, 5 +; CHECK-NEXT: v_readlane_b32 s38, v6, 4 +; CHECK-NEXT: v_readlane_b32 s37, v6, 3 +; CHECK-NEXT: v_readlane_b32 s36, v6, 2 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 59dfd713ef4fd..bd11b0710fadd 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -11,8 +11,8 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s2, s0 -; CHECK-NEXT: s_addc_u32 s1, s3, s1 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 ; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -69,13 +69,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -7, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm @@ -113,7 +113,7 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: s_add_u32 s4, s0, -8 ; CHECK-NEXT: s_addc_u32 s5, s1, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 1 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 48bf7fbe0a3cb..3eef616ba267d 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -46,8 +46,8 @@ define void @use_extern_normal() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; 
CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 @@ -70,8 +70,8 @@ define void @use_extern_overalign() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index b6f70fa6a9892..12212a0968c96 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -84,8 +84,8 @@ define void @f2() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index c316f03dde89b..b689e1e51c2a4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -49,8 +49,8 @@ define void @f0() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -90,8 +90,8 @@ define void @f1() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -131,8 +131,8 @@ define void @f2() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -172,8 +172,8 @@ define void @f3() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index 65b4d37a8d583..93d772fdb7854 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -13,9 +13,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_mul_i32 s14, s14, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s14 -; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc @@ -37,12 +37,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-NEXT: s_load_dword s4, s[8:9], 0x1c ; GFX10-NEXT: s_load_dword s5, s[8:9], 0x38 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff ; GFX10-NEXT: s_mul_i32 s14, s14, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX10-NEXT: v_add3_u32 v2, s5, s14, v0 +; GFX10-NEXT: v_ashrrev_i64 v[4:5], 28, v[1:2] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4 @@ -62,21 +62,19 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c ; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s6, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s13, s13, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add3_u32 v0, s7, s13, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX11-NEXT: v_add3_u32 v1, s7, s13, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1] ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v5, vcc_lo ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index dd5c247f6ef35..14b0729b37302 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -388,8 +388,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB2_13 ; CHECK-NEXT: ; %bb.11: ; 
%memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -684,8 +684,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB4_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -1411,8 +1411,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB10_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -1889,8 +1889,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB15_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index f5e136a80b4a8..b717f85e179b3 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -337,8 +337,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX942-NEXT: .p2align 8 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: .LBB8_0: -; GFX942-NEXT: s_mov_b32 s4, 8 -; GFX942-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2 +; GFX942-NEXT: s_load_dword s0, s[0:1], 0xa ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s0 @@ -353,8 +352,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB8_0: -; GFX90a-NEXT: s_mov_b32 s0, 8 -; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 +; GFX90a-NEXT: s_load_dword s0, s[4:5], 0xa ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b1e05158b6212..e2b5323bcf02c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -615,8 +615,8 @@ define 
hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_movk_i32 s1, 0x7f ; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader @@ -833,8 +833,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0 +; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll index ff90f1f175c3c..40f39a24d7a99 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX6,GFX6_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX6,GFX6_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s ; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectMUBUF. @@ -34,7 +33,3 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in store i32 %result, ptr addrspace(1) %out ret void } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX6_LEGACY: {{.*}} -; GFX6_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 7d3b19e885877..1c986a02e8bd6 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 < %s | FileCheck --check-prefixes=GFX942 %s ; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG ; opcode. 
The RUN lines use -disable-separate-const-offset-from-gep to disable @@ -24,21 +23,13 @@ define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { } define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { -; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_load_gep_add_reassoc: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %add0 = add nuw nsw i64 %voffset, 24 %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 %l = load i64, ptr addrspace(1) %gep0, align 8 @@ -221,23 +212,14 @@ define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 % ; Check that offsets are folded into global addresses if possible. For example, ; this is relevant when using --amdgpu-lower-module-lds-strategy=table. define ptr addrspace(1) @complextype_global_gep(i64 %offset) { -; GFX942_PTRADD-LABEL: complextype_global_gep: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1] -; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 -; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: complextype_global_gep: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_getpc_b64 s[0:1] -; GFX942_LEGACY-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 -; GFX942_LEGACY-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: complextype_global_gep: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 +; GFX942-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep0 = getelementptr inbounds %complextype, ptr addrspace(1) @v0, i64 0, i32 1, i64 %offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2 ret ptr addrspace(1) %gep1 @@ -430,36 +412,20 @@ define ptr @gep_disjoint_or(ptr %base) { ; Check that AssertAlign nodes between ptradd nodes don't block offset folding, ; taken from preload-implicit-kernargs.ll define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) { -; GFX942_PTRADD-LABEL: random_incorrect_offset: -; GFX942_PTRADD: ; %bb.1: -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX942_PTRADD-NEXT: s_branch .LBB21_0 -; GFX942_PTRADD-NEXT: .p2align 8 -; GFX942_PTRADD-NEXT: ; %bb.2: -; GFX942_PTRADD-NEXT: .LBB21_0: -; GFX942_PTRADD-NEXT: s_load_dword s0, s[4:5], 0xa -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0 -; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[8:9] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: random_incorrect_offset: -; GFX942_LEGACY: ; %bb.1: -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_branch .LBB21_0 -; GFX942_LEGACY-NEXT: .p2align 8 -; GFX942_LEGACY-NEXT: ; %bb.2: -; GFX942_LEGACY-NEXT: .LBB21_0: -; GFX942_LEGACY-NEXT: s_mov_b32 s0, 8 -; GFX942_LEGACY-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0 -; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[8:9] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: random_incorrect_offset: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB21_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB21_0: +; GFX942-NEXT: s_load_dword s0, s[4:5], 0xa +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: global_store_dword v0, v1, s[8:9] +; GFX942-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 %load = load i32, ptr addrspace(4) %gep diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll index 1934ce395e63d..e7c715f0a38bf 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel < %s | FileCheck --check-prefixes=GFX942 %s ; Tests for undef and poison DAG folds for the ISD::PTRADD SelectionDAG opcode. ; If any additions are generated for these tests, the folds don't work. @@ -44,6 +43,3 @@ define ptr @undef_base(ptr %p, i64 %offset) { %gep1 = getelementptr i8, ptr undef, i64 %offset ret ptr %gep1 } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX942_LEGACY: {{.*}} -; GFX942_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 9dd25025d4381..f4f5a78f0e2b6 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -1,14 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_LEGACY +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12 ; Tests for the ISD::PTRADD SelectionDAG opcode. This only tests 64-bit address ; spaces since PTRADD is currently only used for these. @@ -511,15 +506,3 @@ entry: store i32 %val, ptr addrspace(1) %gep.to, align 4 ret void } - -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10_LEGACY: {{.*}} -; GFX10_PTRADD: {{.*}} -; GFX11_LEGACY: {{.*}} -; GFX11_PTRADD: {{.*}} -; GFX12_LEGACY: {{.*}} -; GFX12_PTRADD: {{.*}} -; GFX8_LEGACY: {{.*}} -; GFX8_PTRADD: {{.*}} -; GFX942_LEGACY: {{.*}} -; GFX942_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 65a99d0d097f9..480eb0dd5fe9c 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -52,11 +52,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: ; HAWAII-NEXT: s_add_i32 s12, s12, s17 -; HAWAII-NEXT: s_or_b32 s0, s8, 14 -; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 ; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; HAWAII-NEXT: s_add_u32 s0, s8, 14 +; HAWAII-NEXT: s_addc_u32 s1, s9, 0 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s9 +; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] ; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0 ; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 @@ -74,25 +75,27 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; FIJI-NEXT: s_add_i32 s12, s12, s17 -; FIJI-NEXT: s_or_b32 s0, s8, 14 -; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s9 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 -; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0 +; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_and_b32 s3, s1, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: s_and_b32 s4, s1, 0xffff +; FIJI-NEXT: s_add_u32 s2, s8, 14 +; FIJI-NEXT: s_addc_u32 s3, s9, 0 +; FIJI-NEXT: v_mov_b32_e32 v0, s2 +; FIJI-NEXT: v_mov_b32_e32 v1, s3 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0 ; FIJI-NEXT: v_mov_b32_e32 v2, s1 ; FIJI-NEXT: v_mov_b32_e32 v3, s0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s2 ; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s4, v0 ; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 ; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 ; FIJI-NEXT: ds_write_b32 v1, v3 >From 2243d29fb28c8eec6abe76b3cd51294373b41e0c Mon Sep 17 00:00:00 2001 From: Fabian Ritter <fabian.rit...@amd.com> Date: Wed, 6 Aug 2025 08:52:52 -0400 Subject: [PATCH 2/3] Update checks for updated test loop-prefetch-data.ll This only changes the order of some addition operands. 
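For illustration, a minimal sketch of the pattern behind these check updates (the function below is hypothetical and not taken from the test; the two v_add_co_u32 forms are copied from the GFX12 check lines changed in this patch). A GEP like this now selects to ISD::PTRADD(base, offset), which is not commutative, so the pointer operand consistently comes first in the lowered addition, whereas the legacy ISD::ADD lowering could commute the operands:

; Hypothetical input: a uniform base pointer plus a divergent offset,
; as in copy_flat_divergent, where s6 holds the base and v0 the offset.
define ptr @base_plus_offset(ptr %base, i64 %offset) {
  %p = getelementptr inbounds i8, ptr %base, i64 %offset
  ret ptr %p
}
; legacy ADD lowering (commuted): v_add_co_u32 v2, s1, v0, s6  ; offset + base
; PTRADD lowering:                v_add_co_u32 v2, s1, s6, v0  ; base + offset

Since only the operand order differs, the generated code is otherwise unchanged.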
--- llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index ea9d5e8a0bc1f..1e6b77ecea85e 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -400,9 +400,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6 +; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 ; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-NEXT: s_wait_alu 0xf1ff @@ -438,9 +438,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 -; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff @@ -531,9 +531,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6 +; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 ; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-NEXT: s_wait_alu 0xf1ff @@ -569,9 +569,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 -; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff >From 26d807215939ea8cecd740809cb4d5a3f7859ad2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter <fabian.rit...@amd.com> Date: Thu, 7 Aug 2025 04:34:36 -0400 Subject: [PATCH 3/3] Update checks for new test no-folding-imm-to-inst-with-fi.ll --- .../AMDGPU/no-folding-imm-to-inst-with-fi.ll | 75 
++++++------------- 1 file changed, 24 insertions(+), 51 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll index 6d0aa1e784530..7e4be65898b65 100644 --- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll +++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll @@ -9,92 +9,65 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v ; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4 ; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4 ; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base -; CHECK-NEXT: s_movk_i32 s33, 0x70 -; CHECK-NEXT: s_movk_i32 s34, 0x60 -; CHECK-NEXT: s_or_b32 s44, 0x80, s33 -; CHECK-NEXT: s_mov_b32 s45, s35 -; CHECK-NEXT: s_or_b32 s46, 0x80, s34 -; CHECK-NEXT: s_mov_b32 s47, s35 -; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45 -; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47 ; CHECK-NEXT: s_movk_i32 s34, 0x80 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35 +; CHECK-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v21, s35 ; CHECK-NEXT: s_wait_kmcnt 0x0 ; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41 ; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 ; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37 ; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39 -; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS -; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 -; CHECK-NEXT: s_movk_i32 s20, 0x50 ; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29 ; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31 -; CHECK-NEXT: s_wait_alu 0xfffe -; CHECK-NEXT: s_or_b32 s20, 0x80, s20 -; CHECK-NEXT: s_mov_b32 s21, s35 ; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 ; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23 -; CHECK-NEXT: s_wait_alu 0xfffe -; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20 +; CHECK-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21 +; CHECK-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:112 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] offset:80 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 -; CHECK-NEXT: s_or_b32 s16, 0x80, 64 -; CHECK-NEXT: s_mov_b32 s17, s35 -; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 -; CHECK-NEXT: s_or_b32 s12, 0x80, 48 -; CHECK-NEXT: s_mov_b32 s13, s35 -; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; CHECK-NEXT: s_or_b32 s8, 0x80, 32 -; CHECK-NEXT: s_mov_b32 s9, s35 -; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 -; 
CHECK-NEXT: s_or_b32 s4, 0x80, 16 -; CHECK-NEXT: s_mov_b32 s5, s35 ; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 -; CHECK-NEXT: s_wait_alu 0xfffe -; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16 +; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 ; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 -; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12 -; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8 -; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4 +; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 ; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] offset:64 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[4:7] offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:96 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:112 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:64 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:80 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:48 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: s_endpgm bb: _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits