================
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -amdgpu-enable-machine-level-inliner < %s | FileCheck %s
+
+; Variadic intrinsic declaration: takes a ptr followed by varargs and returns i32.
+; NOTE(review): the calls in this test name @llvm.amdgcn.call.whole.wave without
+; the .i32.p0 suffix used here, and !0 is not defined in the part of the file
+; shown - confirm both are intended.
+declare !callback !0 i32 @llvm.amdgcn.call.whole.wave.i32.p0(ptr, ...)
+
+; Calls @simple_whole_wave_func through llvm.amdgcn.call.whole.wave, forwarding
+; %input, and stores the returned i32 to %output.
+; The expected output below still reaches the callee through s_swappc_b64.
+define amdgpu_cs void @inline_simple_wwf(i32 %input, ptr addrspace(1) %output) {
+; CHECK-LABEL: inline_simple_wwf:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s1, simple_whole_wave_func@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, simple_whole_wave_func@abs32@lo
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: global_store_b32 v[40:41], v0, off
+; CHECK-NEXT: s_endpgm
+ %result = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @simple_whole_wave_func, i32 %input)
+ store i32 %result, ptr addrspace(1) %output
+ ret void
+}
+
+; Callee using the amdgpu_gfx_whole_wave calling convention: returns %x + 42.
+; The i1 %active parameter is not referenced in the body.
+define amdgpu_gfx_whole_wave i32 @simple_whole_wave_func(i1 %active, i32 %x) {
+ %plus42 = add i32 %x, 42
+ ret i32 %plus42
+}
+
+; Callee using the amdgpu_gfx_whole_wave calling convention: returns
+; (%a + %b) * 2. The i1 %active parameter is not referenced in the body.
+define amdgpu_gfx_whole_wave i32 @another_whole_wave_func(i1 %active, i32 %a, i32 %b) {
+ %sum = add i32 %a, %b
+ %result = mul i32 %sum, 2
+ ret i32 %result
+}
+
+; Makes two whole-wave calls in sequence: @simple_whole_wave_func(%x), whose
+; result is stored to %out1, and @another_whole_wave_func(%x, %y), whose result
+; is stored to %out2. The expected output contains one s_swappc_b64 per callee.
+define amdgpu_cs void @inline_multiple_wwf(i32 %x, i32 %y, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
+; CHECK-LABEL: inline_multiple_wwf:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s1, simple_whole_wave_func@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, simple_whole_wave_func@abs32@lo
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: v_dual_mov_b32 v41, v5 :: v_dual_mov_b32 v44, v0
+; CHECK-NEXT: v_dual_mov_b32 v40, v4 :: v_dual_mov_b32 v43, v3
+; CHECK-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v45, v1
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v46, v0 :: v_dual_mov_b32 v1, v45
+; CHECK-NEXT: v_mov_b32_e32 v0, v44
+; CHECK-NEXT: s_mov_b32 s1, another_whole_wave_func@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, another_whole_wave_func@abs32@lo
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: global_store_b32 v[42:43], v46, off
+; CHECK-NEXT: global_store_b32 v[40:41], v0, off
+; CHECK-NEXT: s_endpgm
+ %result1 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @simple_whole_wave_func, i32 %x)
+ %result2 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @another_whole_wave_func, i32 %x, i32 %y)
+ store i32 %result1, ptr addrspace(1) %out1
+ store i32 %result2, ptr addrspace(1) %out2
+ ret void
+}
+
----------------
cmc-rep wrote:
Could we also have a test in which one WWF is called by multiple kernels? I
believe it should work, but having a test like that would still be helpful.
https://github.com/llvm/llvm-project/pull/169476
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits