================
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
+; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
+; similar transformations in that pass.
+;
+; The two RUN lines differ only in -amdgpu-use-sdag-ptradd (1 vs. 0); their
+; outputs are checked with the GFX942_PTRADD and GFX942_LEGACY prefixes,
+; respectively.
+
+; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
+; %voffset (z) is used twice: as the offset of the second GEP and as an operand
+; of the final add. In the checks below, the legacy path folds the constant 24
+; into the load's immediate offset, while the PTRADD path adds it with a
+; separate v_lshl_add_u64.
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
+; GFX942_PTRADD: ; %bb.0:
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 24
+; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
+; GFX942_LEGACY: ; %bb.0:
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
+  %l = load i64, ptr addrspace(1) %gep1, align 8
+  %r = add i64 %l, %voffset
+  ret i64 %r
+}
+
+; The GEP offset is (add nuw nsw %voffset, 24). In the checks below, the legacy
+; path folds the constant 24 into the load's immediate offset (offset:24), while
+; the PTRADD path adds it to the offset with a separate v_lshl_add_u64.
+; NOTE(review): presumably meant to exercise reassociation of
+; (ptradd p, (add x, c)) - confirm against the combine being tested.
+define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
+; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
+; GFX942_PTRADD: ; %bb.0:
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 24
+; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
+; GFX942_LEGACY: ; %bb.0:
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+  %add0 = add nuw nsw i64 %voffset, 24
+  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
+  %l = load i64, ptr addrspace(1) %gep0, align 8
+  ret i64 %l
+}
+
+; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
+; would be folded away in most cases, but the index computation introduced by
+; the legalization of wide vector stores can for example introduce them.
+; Stores a <16 x i32> kernel argument to %out. Both check sequences below emit
+; four global_store_dwordx4 instructions. For the highest 16-byte chunk, the
+; legacy path uses offset:48 from the base pointer, while the PTRADD path first
+; adds 32 to the base (s_add_u32/s_addc_u32) and then stores at offset:16.
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
+; GFX942_PTRADD-LABEL: store_v16i32:
+; GFX942_PTRADD: ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v4, 0
+; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT: s_add_u32 s2, s0, 32
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s20
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s21
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s22
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s23
+; GFX942_PTRADD-NEXT: s_addc_u32 s3, s1, 0
+; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942_PTRADD-NEXT: s_nop 1
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s16
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s17
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s18
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s19
+; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942_PTRADD-NEXT: s_nop 1
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s12
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s13
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s14
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s15
+; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942_PTRADD-NEXT: s_nop 1
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s8
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s9
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s10
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s11
+; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942_PTRADD-NEXT: s_endpgm
+;
+; GFX942_LEGACY-LABEL: store_v16i32:
+; GFX942_LEGACY: ; %bb.0: ; %entry
+; GFX942_LEGACY-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v4, 0
+; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s20
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s21
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s22
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s23
+; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942_LEGACY-NEXT: s_nop 1
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s16
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s17
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s18
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s19
+; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942_LEGACY-NEXT: s_nop 1
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s12
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s13
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s14
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s15
+; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942_LEGACY-NEXT: s_nop 1
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s8
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s9
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s10
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s11
+; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942_LEGACY-NEXT: s_endpgm
+entry:
+  store <16 x i32> %a, ptr addrspace(1) %out
+  ret void
+}
+
+
+; Tests the (ptradd 0, x) -> x DAG combine.
+; The GEP below has a null base pointer and a variable offset. In the checks,
+; the PTRADD output still contains an add with a zero base
+; (v_lshl_add_u64 v[0:1], 0, 0, v[0:1]), whereas the legacy output stores
+; through the offset registers directly.
+define void @baseptr_null(i64 %offset, i8 %v) {
+; GFX942_PTRADD-LABEL: baseptr_null:
+; GFX942_PTRADD: ; %bb.0:
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], 0, 0, v[0:1]
+; GFX942_PTRADD-NEXT: flat_store_byte v[0:1], v2
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: baseptr_null:
+; GFX942_LEGACY: ; %bb.0:
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT: flat_store_byte v[0:1], v2
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr null, i64 %offset
+  store i8 %v, ptr %gep, align 1
+  ret void
+}
+
+; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
+; assertalign DAG combine.
+; Performs volatile i8 loads through the pointers returned by the queue.ptr,
+; implicitarg.ptr and dispatch.ptr intrinsics, then a volatile store of the
+; dispatch id to %ptr. In the checks below, the legacy path emits one load with
+; an immediate offset (s[4:5] offset:8), while the PTRADD path first computes
+; s[8:9] = s[4:5] + 8 with s_add_u32/s_addc_u32 and loads from s[8:9].
+; NOTE(review): attribute group #0 is referenced here but is not defined in the
+; visible part of this file - confirm it is defined further down.
+define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
+; GFX942_PTRADD-LABEL: llvm_amdgcn_queue_ptr:
+; GFX942_PTRADD: ; %bb.0:
+; GFX942_PTRADD-NEXT: s_add_u32 s8, s4, 8
+; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 0
+; GFX942_PTRADD-NEXT: s_addc_u32 s9, s5, 0
+; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
+; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[8:9] sc0 sc1
+; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
+; GFX942_PTRADD-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942_PTRADD-NEXT: ; kill: killed $sgpr8 killed $sgpr9
+; GFX942_PTRADD-NEXT: ; kill: killed $sgpr2_sgpr3
+; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
+; GFX942_PTRADD-NEXT: s_endpgm
+;
+; GFX942_LEGACY-LABEL: llvm_amdgcn_queue_ptr:
+; GFX942_LEGACY: ; %bb.0:
+; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 0
+; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
+; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
+; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
+; GFX942_LEGACY-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942_LEGACY-NEXT: ; kill: killed $sgpr2_sgpr3
+; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942_LEGACY-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
+; GFX942_LEGACY-NEXT: s_endpgm
+  %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+  %queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
+  %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
+  %dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
+  store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
+  ret void
+}
+
+; Declarations of the intrinsics called by @llvm_amdgcn_queue_ptr.
+declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+declare noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+declare i64 @llvm.amdgcn.dispatch.id()
+declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
----------------
ritter-x2a wrote:
The test works without these declarations; I included them only because they are also present in [implicit-kernarg-backend-usage.ll](https://github.com/llvm/llvm-project/blob/d363847d4c4f3922875c23c69fd0e6e0148c7eff/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll#L381-L384). Should I drop them? https://github.com/llvm/llvm-project/pull/142738 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits