================
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s
+
+; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
+
+define amdgpu_cs void @amdgpu_cs() #0 {
+; CHECK-LABEL: amdgpu_cs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  ret void
+}
+
+define amdgpu_kernel void @kernel() #0 {
+; CHECK-LABEL: kernel:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  ret void
+}
+
+define amdgpu_cs void @with_local() #0 {
+; CHECK-LABEL: with_local:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 13
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 13, ptr addrspace(5) %local
+  ret void
+}
+
+; Check that we generate s_cselect for SP if we can fit
+; the offset in an inline constant.
+define amdgpu_cs void @with_calls_inline_const() #0 {
+; CHECK-LABEL: with_calls_inline_const:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_cselect_b32 s32, 0x1d0, 16
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+; Check that we generate s_mov + s_cmovk if we can't
+; fit the offset for SP in an inline constant.
+define amdgpu_cs void @with_calls_no_inline_const() #0 {
+; CHECK-LABEL: with_calls_no_inline_const:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_movk_i32 s32, 0x100
+; CHECK-NEXT: s_cmovk_i32 s32, 0x2c0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  %local = alloca i32, i32 61, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+; We're going to limit this to 16 VGPRs, so we need to spill the rest.
+define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #1 {
+; CHECK-LABEL: with_spills:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:96
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:80 ; 16-byte Folded Spill
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_b128 v[8:11], v[0:1], off offset:112
+; CHECK-NEXT: global_load_b128 v[12:15], v[0:1], off offset:64
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:80
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:64 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:32
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:48 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:48
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:32 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 ; 16-byte Folded Spill
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:96
+; CHECK-NEXT: global_store_b128 v[2:3], v[8:11], off offset:112
+; CHECK-NEXT: global_store_b128 v[2:3], v[12:15], off offset:64
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:80
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:48 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:32
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:32 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:48
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:16 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  %v = load <32 x i32>, ptr addrspace(1) %p1
+  store <32 x i32> %v, ptr addrspace(1) %p2
+  ret void
+}
+
+define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+; CHECK-LABEL: realign_stack:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_cmovk_i32 s33, 0x200
+; CHECK-NEXT: s_movk_i32 s32, 0x100
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
+; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96
+; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
+; CHECK-NEXT: scratch_store_b128 off, v[16:19], s33 offset:64
+; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48
+; CHECK-NEXT: scratch_store_b128 off, v[8:11], s33 offset:32
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_cmovk_i32 s32, 0x300
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  %v = alloca <32 x i32>, align 128, addrspace(5)
+  store <32 x i32> %x, ptr addrspace(5) %v
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+; Non-entry functions and graphics shaders can't run on a compute queue,
+; so they don't need to worry about CWSR.
+define amdgpu_gs void @amdgpu_gs() #0 {
+; CHECK-LABEL: amdgpu_gs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_mov_b32 s32, 16
+; CHECK-NEXT: scratch_store_b8 off, v0, off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define amdgpu_gfx void @amdgpu_gfx() #0 {
+; CHECK-LABEL: amdgpu_gfx:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_mov_b32 s0, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b32 s1, -1
+; CHECK-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s1
+; CHECK-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_add_co_i32 s32, s32, 16
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s0, v40, 2
+; CHECK-NEXT: s_or_saveexec_b32 s1, -1
+; CHECK-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s1
+; CHECK-NEXT: s_mov_b32 s33, s0
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define void @default() #0 {
+; CHECK-LABEL: default:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  ret void
+}
+
+declare amdgpu_gfx void @callee(i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="16"}
+
----------------
arsenm wrote:
Check interaction with the frame-pointer attribute? https://github.com/llvm/llvm-project/pull/130055
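For illustration only, a minimal sketch of what such an extra case might look like; it is not part of the patch under review, and the function name, attribute group number, and expected codegen are assumptions. The "frame-pointer"="all" string attribute is the standard LLVM way to force frame-pointer setup:

; Hypothetical additional test case: force a frame pointer and see how it
; combines with the s33 setup emitted for +dynamic-vgpr. CHECK lines would
; need to be regenerated with utils/update_llc_test_checks.py.
define amdgpu_cs void @with_fp_attr() #2 {
  %local = alloca i32, addrspace(5)
  store volatile i8 13, ptr addrspace(5) %local
  ret void
}

attributes #2 = { nounwind "frame-pointer"="all" }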