https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/140582
We need to consider the use instruction's interpretation of the bits, not the defined immediate without use context. This will regress some cases where we previously could match f64 inline constants. We can restore them by either using pseudo instructions to materialize f64 constants, or recognizing reg_sequence decomposed into 32-bit pieces for them (which essentially means recognizing every other input is a 0). Fixes #139908 >From 9ae69332688f4864b25449a694dc67968a1bf45b Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Fri, 16 May 2025 17:21:39 +0200 Subject: [PATCH] AMDGPU: Check for subreg match when folding through reg_sequence We need to consider the use instruction's interpretation of the bits, not the defined immediate without use context. This will regress some cases where we previously could match f64 inline constants. We can restore them by either using pseudo instructions to materialize f64 constants, or recognizing reg_sequence decomposed into 32-bit pieces for them (which essentially means recognizing every other input is a 0). Fixes #139908 --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 48 ++++- llvm/test/CodeGen/AMDGPU/constrained-shift.ll | 6 +- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 5 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 14 +- llvm/test/CodeGen/AMDGPU/operand-folding.ll | 4 +- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 198 ++++++++++++++++-- .../AMDGPU/si-fold-operands-subreg-imm.mir | 2 +- 7 files changed, 242 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 92937e33fd500..d81f25c57af60 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -895,6 +895,8 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const { if (!SrcRC) return {}; + // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. 
recognize + // every other element is 0 for 64-bit immediates) int64_t Imm; for (unsigned I = 0, E = Defs.size(); I != E; ++I) { const MachineOperand *Op = Defs[I].first; @@ -924,10 +926,41 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return nullptr; - // FIXME: Verify SplatRC is compatible with the use operand - uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; - if (!TII->isInlineConstant(*SplatVal, OpTy) || - !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal)) + int16_t RCID = Desc.operands()[UseOpIdx].RegClass; + if (RCID == -1) + return nullptr; + + // Special case 0/-1, since when interpreted as a 64-bit element both halves + // have the same bits. Effectively this code does not handle 64-bit element + // operands correctly, as the incoming 64-bit constants are already split into + // 32-bit sequence elements. + // + // TODO: We should try to figure out how to interpret the reg_sequence as a + // split 64-bit splat constant, or use 64-bit pseudos for materializing f64 + // constants. + if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) { + const TargetRegisterClass *OpRC = TRI->getRegClass(RCID); + // We need to figure out the scalar type read by the operand. e.g. the MFMA + // operand will be AReg_128, and we want to check if it's compatible with an + // AReg_32 constant. 
+ uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; + switch (OpTy) { + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0); + break; + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1); + break; + default: + return nullptr; + } + + if (!TRI->getCommonSubClass(OpRC, SplatRC)) + return nullptr; + } + + if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal)) return nullptr; return SplatVal; @@ -1039,14 +1072,13 @@ void SIFoldOperandsImpl::foldOperand( } } - if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList)) + if (RSUse->getSubReg() != RegSeqDstSubReg) continue; - if (RSUse->getSubReg() != RegSeqDstSubReg) + if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList)) continue; - foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList, - CopiesToReplace); + foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace); } return; diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index af4ca2ad7120a..fb53e889b1158 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -192,8 +192,10 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) ; ; GISEL-LABEL: s_csh_v4i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31 -; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31 +; GISEL-NEXT: s_mov_b32 s8, 31 +; GISEL-NEXT: s_mov_b32 s9, s8 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] ; GISEL-NEXT: s_lshl_b32 s8, s0, s4 ; GISEL-NEXT: s_lshl_b32 s9, s1, s5 ; GISEL-NEXT: s_lshl_b32 s10, s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 28245c538a04c..d588f0e0897b7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -745,7 +745,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1 +; GFX12-SDAG-NEXT: s_mov_b32 s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 s1, s0 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 86bfb694ab255..5d5dc01439fe4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -262,11 +262,19 @@ bb: ret void } -; FIXME: This should not be foldable as an inline immediate ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low: -; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}} +; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}} +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]] + +; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}} ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f64_16x16x4_f64 
[[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}} +; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}} ; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0] ; GCN: global_store_dwordx4 ; GCN: global_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll index ebfc5d02134c5..778d73fd919fc 100644 --- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -155,7 +155,9 @@ define i32 @issue139908(i64 %in) { ; CHECK-LABEL: issue139908: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 42, v[0:1] +; CHECK-NEXT: s_mov_b32 s4, 42 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; CHECK-NEXT: v_cndmask_b32_e64 v0, 2, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %eq = icmp eq i64 %in, 180388626474 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index ddc3e770767b8..bef38c1a65ef8 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { ; GFX900-LABEL: fadd_v2_vv: @@ -411,10 +411,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) -; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1186,10 +1188,12 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 4.0 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 ; 
PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) -; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1594,6 +1598,40 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0] ; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fma_v2_v_imm: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX90A-GISEL-NEXT: s_mov_b32 s4, 0x43480000 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fma_v2_v_imm: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x43480000 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX942-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1675,19 +1713,39 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-SDAG-NEXT: s_endpgm ; -; PACKED-GISEL-LABEL: fma_v2_v_lit_splat: -; PACKED-GISEL: ; %bb.0: -; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0 -; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 -; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) -; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3] -; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; PACKED-GISEL-NEXT: s_endpgm +; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fma_v2_v_lit_splat: +; GFX942-GISEL: ; %bb.0: +; 
GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX942-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1725,6 +1783,40 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] ; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX90A-GISEL-NEXT: s_mov_b32 s5, 2.0 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, 0x40400000 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0 +; GFX942-GISEL-NEXT: s_mov_b32 s3, 0x40400000 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -2059,6 +2151,37 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0: +; GFX90A-GISEL: ; %bb.0: ; %bb +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fadd_fadd_fsub_0: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX942-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> zeroinitializer, %arg %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison> @@ -2099,6 +2222,40 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] ; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fadd_fadd_fsub: +; GFX90A-GISEL: ; %bb.0: ; %bb +; GFX90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX90A-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fadd_fadd_fsub: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2 +; GFX942-GISEL-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX942-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison> @@ -2251,3 +2408,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90A-SDAG: {{.*}} +; GFX942-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir index aa1a7441bc477..8d6c3efb5ded5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir @@ -140,7 +140,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], 42, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], [[REG_SEQUENCE]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0, implicit $scc %0:sgpr_64 = COPY $sgpr8_sgpr9 %1:sreg_32 = S_MOV_B32 42 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits