https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/127563
The real legality check is performed later anyway, so this was unnecessarily blocking immediate folds in handled cases.

This also stops folding s_fmac_f32 to s_fmamk_f32 in a few tests, but that seems better. The GlobalISel changes look suspicious; GlobalISel may be mishandling constants for VOP3P instructions.

From 3dd61c69e1cd3cab752cac624c0a5be42b0ca193 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Mon, 17 Feb 2025 22:31:48 +0700
Subject: [PATCH] AMDGPU: Fix overly conservative immediate operand check

The real legality check is performed later anyway, so this was
unnecessarily blocking immediate folds in handled cases.

This also stops folding s_fmac_f32 to s_fmamk_f32 in a few tests, but
that seems better. The GlobalISel changes look suspicious; GlobalISel
may be mishandling constants for VOP3P instructions.

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp        |  3 ++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll     | 16 ++++------------
 llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll      | 16 ++++------------
 llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll      |  4 +---
 llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll      |  6 ++----
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll    |  6 ++----
 .../CodeGen/AMDGPU/fold-operands-scalar-fmac.mir |  4 ++--
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll    |  5 +----
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll          | 10 +++++-----
 llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll    |  4 ++--
 10 files changed, 25 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 84773349e0ca0..cbd858b9002ee 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -830,7 +830,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
   if (UseOpIdx >= Desc.getNumOperands())
     return false;
 
-  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
+  // Filter out unhandled pseudos.
+  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
     return false;
 
   uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 4be00fedb972e..89078f20f1d47 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -920,9 +920,7 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1
 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT: ; return to shader part epilog
 ;
@@ -962,9 +960,7 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr
 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
 ; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT: ; return to shader part epilog
 ;
@@ -1004,9 +1000,7 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT: ; return to shader part epilog
 ;
@@ -1060,9 +1054,7 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg
 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16
 ; GFX6-NEXT: s_and_b32 s6, s12, 0xffff
 ; GFX6-NEXT: s_or_b32 s5, s5, s6
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_mov_b32 s7, s6
-; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1
 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e7119c89ac06c..065fadf3b5ef3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -919,9 +919,7 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1)
 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT: ; return to shader part epilog
 ;
@@ -961,9 +959,7 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre
 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT: ; return to shader part epilog
 ;
@@ -1003,9 +999,7 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
 ; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1
 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT: ; return to shader part epilog
 ;
@@ -1059,9 +1053,7 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %
 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16
 ; GFX6-NEXT: s_and_b32 s6, s12, 0xffff
 ; GFX6-NEXT: s_or_b32 s5, s5, s6
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_mov_b32 s7, s6
-; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1
 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index ed85fb19d9051..43322b1e23412 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -118,13 +118,11 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ; GFX7-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT: s_mov_b32 s8, -1
 ; GFX7-NEXT: s_or_b32 s0, s1, s0
 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16
 ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX7-NEXT: s_mov_b32 s9, s8
 ; GFX7-NEXT: s_or_b32 s1, s1, s2
-; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9]
+; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], -1
 ; GFX7-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: scalar_xnor_v4i16_one_use:
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
index f6fc69a6e3e47..ea93e3ac1e595 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -5,16 +5,14 @@ define amdgpu_cs <2 x i32> @f() {
 ; CHECK-LABEL: f:
 ; CHECK: ; %bb.0: ; %bb
 ; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
 ; CHECK-NEXT: s_mov_b32 s5, s4
 ; CHECK-NEXT: s_mov_b32 s6, s4
 ; CHECK-NEXT: s_mov_b32 s7, s4
-; CHECK-NEXT: s_mov_b32 s0, s4
 ; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; CHECK-NEXT: s_mov_b32 s1, s4
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[0:1], v[0:1]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
 ; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: s_mov_b32 s1, 0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; CHECK-NEXT: v_readfirstlane_b32 s0, v0
 ; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 4011c21af6904..661af021e8a84 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -192,10 +192,8 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
 ;
 ; GISEL-LABEL: s_csh_v4i32:
 ; GISEL: ; %bb.0:
-; GISEL-NEXT: s_mov_b32 s8, 31
-; GISEL-NEXT: s_mov_b32 s9, s8
-; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
-; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
+; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31
+; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31
 ; GISEL-NEXT: s_lshl_b32 s8, s0, s4
 ; GISEL-NEXT: s_lshl_b32 s9, s1, s5
 ; GISEL-NEXT: s_lshl_b32 s10, s2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
index 08693ec9db1d4..aeca4398f9a83 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
@@ -13,7 +13,7 @@ body: |
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
   ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
-  ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
+  ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 1056964608, [[COPY]], [[COPY1]], implicit $mode
   ; CHECK-NEXT: $sgpr0 = COPY %fma
   %0:sreg_32 = COPY $sgpr0
   %1:sreg_32 = COPY $sgpr1
@@ -33,7 +33,7 @@ body: |
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
   ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
-  ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
+  ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
   ; CHECK-NEXT: $sgpr0 = COPY %fma
   %0:sreg_32 = COPY $sgpr0
   %1:sreg_32 = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 492a30b67089c..bc49f70cbee11 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -742,10 +742,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
 ; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_mov_b32 s0, 1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_mov_b32 s1, s0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1
 ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index b59f3c0d410f8..9b03a72fd826d 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -87,7 +87,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
 ; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 ; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0{{$}}
 define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -308,7 +308,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
 ; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0{{$}}
 define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -432,7 +432,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
 ; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
 ; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0{{$}}
 define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -556,8 +556,8 @@ bb:
 ; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
 ; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0{{$}}
 define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
 bb:
   %i12 = fadd <2 x float> zeroinitializer, %arg
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
index 81d792183dc06..debbfce7dadcc 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
@@ -218,7 +218,7 @@ define amdgpu_ps float @_amdgpu_ps_main() {
 ; GFX1150-NEXT: s_mov_b32 s3, s0
 ; GFX1150-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0
 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0
+; GFX1150-NEXT: s_fmac_f32 s0, s1, 4.0
 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1150-NEXT: v_mov_b32_e32 v0, s0
 ; GFX1150-NEXT: ; return to shader part epilog
@@ -232,7 +232,7 @@ define amdgpu_ps float @_amdgpu_ps_main() {
 ; GFX12-NEXT: s_mov_b32 s3, s0
 ; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0
+; GFX12-NEXT: s_fmac_f32 s0, s1, 4.0
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
 ; GFX12-NEXT: ; return to shader part epilog
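
For readers skimming the patch, the gist of the change is that tryToFoldACImm no longer requires the use operand to already accept an inline constant; it only rejects operand kinds the fold code does not handle at all, and defers the real legality decision to the later check. Below is a minimal, self-contained C++ sketch of that gating change. It is illustrative only: OperandDesc, isFoldLegal, tryFoldImmediateOld, and tryFoldImmediateNew are hypothetical stand-ins, not LLVM APIs.

#include <cstdint>
#include <iostream>

// Hypothetical description of one instruction operand.
struct OperandDesc {
  bool IsSrc;            // a source operand the folder knows how to handle
  bool AcceptsInlineImm; // encoding has room for an inline constant
};

// Stand-in for the later, definitive legality check the fold relies on.
static bool isFoldLegal(const OperandDesc &Op, int64_t Imm) {
  // Toy rule: inline-capable operands take small constants directly;
  // other handled source operands can still be rewritten to a literal form.
  return Op.AcceptsInlineImm ? (Imm >= -16 && Imm <= 64) : Op.IsSrc;
}

// Old gating: bail out unless the operand could already take an inline
// immediate, even though isFoldLegal() would have decided correctly.
static bool tryFoldImmediateOld(const OperandDesc &Op, int64_t Imm) {
  if (!(Op.IsSrc && Op.AcceptsInlineImm))
    return false;
  return isFoldLegal(Op, Imm);
}

// New gating: only filter out operands the folder does not handle at all,
// and let isFoldLegal() make the real decision.
static bool tryFoldImmediateNew(const OperandDesc &Op, int64_t Imm) {
  if (!Op.IsSrc)
    return false;
  return isFoldLegal(Op, Imm);
}

int main() {
  // A source operand that cannot take an inline constant: the old gate
  // rejects the fold up front, the new gate lets the legality check decide.
  OperandDesc LiteralOnlySrc{/*IsSrc=*/true, /*AcceptsInlineImm=*/false};
  std::cout << "old gate: " << tryFoldImmediateOld(LiteralOnlySrc, 1) << "\n"
            << "new gate: " << tryFoldImmediateNew(LiteralOnlySrc, 1) << "\n";
  return 0;
}

The only case where the two gates differ is a source operand that cannot take an inline constant: the old gate rejected it before the legality check ever ran, which is the over-conservatism the patch removes.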