https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/153178
>From 441b8928a2a4ad8ee987d38c6393c126ea59acb7 Mon Sep 17 00:00:00 2001 From: Petar Avramovic <petar.avramo...@amd.com> Date: Tue, 12 Aug 2025 14:22:24 +0200 Subject: [PATCH] AMDGPU/GlobalISel: Import D16 load patterns and add combines for them Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them. This imports the D16 load patterns into GlobalISel's TableGen-generated instruction selector. For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16 in AMDGPURegBankCombiner. --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 11 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 7 + .../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 99 +++++ llvm/lib/Target/AMDGPU/SIInstructions.td | 15 + .../AMDGPU/GlobalISel/atomic_load_flat.ll | 15 +- .../AMDGPU/GlobalISel/atomic_load_global.ll | 15 +- .../AMDGPU/GlobalISel/atomic_load_local_2.ll | 13 +- .../CodeGen/AMDGPU/GlobalISel/load-d16.ll | 412 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 246 +++-------- 9 files changed, 637 insertions(+), 196 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index b5dac95b57a2d..acd1277c07902 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -71,6 +71,14 @@ def int_minmax_to_med3 : GICombineRule< [{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), (apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>; +def d16_matchdata : GIDefMatchData<"D16MatchInfo">; + +def d16_load : GICombineRule< + (defs root:$bitcast, d16_matchdata:$matchinfo), + (match (wip_match_opcode G_BITCAST):$bitcast, + [{ return matchD16Load(*${bitcast}, ${matchinfo}); }]), + (apply [{ applyD16Load(*${bitcast}, ${matchinfo}); }])>; + def fp_minmax_to_med3 : GICombineRule< (defs root:$min_or_max, med3_matchdata:$matchinfo), (match (wip_match_opcode G_FMAXNUM, @@ -219,5 +227,6 @@ def AMDGPURegBankCombiner : 
GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + d16_load]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 394a143dd3086..a4ccf368f7745 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -309,6 +309,13 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>; +def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO, SIload_d16_lo>; +def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_U8, SIload_d16_lo_u8>; +def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_I8, SIload_d16_lo_i8>; +def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI, SIload_d16_hi>; +def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_U8, SIload_d16_hi_u8>; +def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_I8, SIload_d16_hi_i8>; + def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>; // G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return, // so we don't mark it as equivalent. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..7e16729657411 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -74,6 +74,12 @@ class AMDGPURegBankCombinerImpl : public Combiner { Register Val0, Val1, Val2; }; + struct D16MatchInfo { + MachineInstr *Load; + unsigned Opc; + Register Dst; + }; + MinMaxMedOpc getMinMaxPair(unsigned Opc) const; template <class m_Cst, typename CstTy> @@ -89,6 +95,9 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool matchD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const; + void applyD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +401,96 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI, + D16MatchInfo &MatchInfo) const { + if (!STI.d16PreservesUnusedBits()) + return false; + + Register Dst; + MachineInstr *Load, *SextLoad; + const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000; + const int64_t CleanHi16 = 0x000000000000FFFF; + + // Load lo + if (mi_match(MI.getOperand(1).getReg(), MRI, + m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)), + m_Copy(m_SpecificICst(CleanLo16))), + m_MInstr(Load)))) { + + if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) { + const MachineMemOperand *MMO = *Load->memoperands_begin(); + unsigned LoadSize = MMO->getSizeInBits().getValue(); + if (LoadSize == 8) { + MatchInfo = {Load, AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, Dst}; + } else if (LoadSize == 16) { + MatchInfo = {Load, AMDGPU::G_AMDGPU_LOAD_D16_LO, Dst}; + } else + return false; + return true; + } + + if (mi_match( + Load, MRI, + m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) { + if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD) 
+ return false; + + const MachineMemOperand *MMO = *SextLoad->memoperands_begin(); + if (MMO->getSizeInBits().getValue() != 8) + return false; + + MatchInfo = {SextLoad, AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, Dst}; + return true; + } + + return false; + } + + // Load hi + if (mi_match(MI.getOperand(1).getReg(), MRI, + m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)), + m_Copy(m_SpecificICst(CleanHi16))), + m_GShl(m_MInstr(Load), m_Copy(m_SpecificICst(16)))))) { + + if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) { + const MachineMemOperand *MMO = *Load->memoperands_begin(); + unsigned LoadSize = MMO->getSizeInBits().getValue(); + if (LoadSize == 8) { + MatchInfo = {Load, AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, Dst}; + } else if (LoadSize == 16) { + MatchInfo = {Load, AMDGPU::G_AMDGPU_LOAD_D16_HI, Dst}; + } else + return false; + return true; + } + + if (mi_match( + Load, MRI, + m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) { + if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD) + return false; + const MachineMemOperand *MMO = *SextLoad->memoperands_begin(); + if (MMO->getSizeInBits().getValue() != 8) + return false; + + MatchInfo = {SextLoad, AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, Dst}; + return true; + } + + return false; + } + + return false; +} + +void AMDGPURegBankCombinerImpl::applyD16Load(MachineInstr &MI, + D16MatchInfo &MatchInfo) const { + B.buildInstr(MatchInfo.Opc, {MI.getOperand(0).getReg()}, + {MatchInfo.Load->getOperand(1).getReg(), MatchInfo.Dst}) + .setMemRefs(MatchInfo.Load->memoperands()); + MI.eraseFromParent(); +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo<SIMachineFunctionInfo>()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e8b4501226732..56cc324aecc13 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -4251,6 +4251,21 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction; def 
G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction; def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction; +class D16LoadGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins ptype1:$addr); + let hasSideEffects = 0; + let mayLoad = 1; +} + +def G_AMDGPU_LOAD_D16_LO : D16LoadGenericInstruction; +def G_AMDGPU_LOAD_D16_LO_U8 : D16LoadGenericInstruction; +def G_AMDGPU_LOAD_D16_LO_I8 : D16LoadGenericInstruction; +def G_AMDGPU_LOAD_D16_HI : D16LoadGenericInstruction; +def G_AMDGPU_LOAD_D16_HI_U8 : D16LoadGenericInstruction; +def G_AMDGPU_LOAD_D16_HI_I8 : D16LoadGenericInstruction; + + class BufferStoreGenericInstruction : AMDGPUGenericInstruction { let OutOperandList = (outs); let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll index 97694f3304431..d03bbdecf84ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll @@ -186,11 +186,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_hi_vector_insert(ptr %ptr, ; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr %ptr monotonic, align 2 %insert = insertelement <2 x i16> %vec, i16 %load, i32 1 @@ -260,10 +260,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_lo_vector_insert(ptr %ptr, ; 
GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000 +; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr %ptr monotonic, align 2 %insert = insertelement <2 x i16> %vec, i16 %load, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll index 5d902d5ec98ab..a8def6e6f6e92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll @@ -519,11 +519,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_hi_vector_insert(ptr addr ; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 %insert = insertelement <2 x i16> %vec, i16 %load, i32 1 @@ -622,10 +622,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_lo_vector_insert(ptr addr ; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc -; GFX9-NEXT: 
v_mov_b32_e32 v1, 0xffff0000 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 %insert = insertelement <2 x i16> %vec, i16 %load, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll index 31cdbbe1c4d73..fc7eafbbcdc77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll @@ -400,11 +400,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_hi_vector_insert(ptr addrs ; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_u16 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: ds_read_u16 v2, v0 +; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 %insert = insertelement <2 x i16> %vec, i16 %load, i32 1 @@ -478,10 +477,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_lo_vector_insert(ptr addrs ; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_u16 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: ds_read_u16 v2, v0 +; GFX9-NEXT: ds_read_u16_d16 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr 
addrspace(3) %ptr monotonic, align 2 %insert = insertelement <2 x i16> %vec, i16 %load, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll new file mode 100644 index 0000000000000..5d5b752b27a3c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll @@ -0,0 +1,412 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s + +define amdgpu_ps void @load_P0_B16_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: load_P0_B16_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_d16_b16 v0, v[1:2] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(0) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 0 + store <2 x i16> %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: load_P0_B16_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_d16_hi_b16 v0, v[1:2] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(0) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 1 + store <2 x i16> %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: sextload_P0_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_d16_i8 v0, v[1:2] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(0) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void 
@sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: sextload_P0_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_d16_hi_i8 v0, v[1:2] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(0) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @zextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: zextload_P0_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_d16_u8 v0, v[1:2] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(0) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @zextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: zextload_P0_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_d16_hi_u8 v0, v[1:2] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(0) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @load_P1_B16_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_P1_B16_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_b16 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 0 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> 
%vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_P1_B16_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_b16 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 1 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P1_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P1_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P1_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_u8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i8_D16_Hi(<2 x i16> 
%vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P1_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_u8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_P3_B16_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: load_P3_B16_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_u16_d16 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v2, v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(3) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 0 + store <2 x i16> %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: load_P3_B16_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v2, v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(3) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 1 + store <2 x i16> %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: sextload_P3_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_i8_d16 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v2, v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: sextload_P3_i8_D16_Hi: +; GFX12: ; %bb.0: +; 
GFX12-NEXT: ds_load_i8_d16_hi v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v2, v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @zextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: zextload_P3_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_u8_d16 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v2, v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @zextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: zextload_P3_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_u8_d16_hi v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v2, v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_P4_B16_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_P4_B16_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_b16 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 0 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_P4_B16_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_b16 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 1 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P4_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P4_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P4_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_u8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P4_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_u8 v0, v[1:2], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_P5_B16_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: load_P5_B16_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_d16_b16 v0, v1, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v2, v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(5) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 0 + store <2 x i16> %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @load_P5_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: load_P5_B16_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_d16_hi_b16 v0, v1, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v2, v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(5) %ptra + %res = insertelement <2 x i16> %vec, i16 %a, i32 1 + store <2 x i16> %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @sextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: sextload_P5_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_d16_i8 v0, v1, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v2, v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(5) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @sextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: sextload_P5_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_d16_hi_i8 v0, v1, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v2, v0, off +; GFX12-NEXT: 
s_endpgm + %a = load i8, ptr addrspace(5) %ptra + %a16 = sext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @zextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: zextload_P5_i8_D16: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_d16_u8 v0, v1, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v2, v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(5) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 + store <2 x i16> %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @zextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: zextload_P5_i8_D16_Hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_d16_hi_u8 v0, v1, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v2, v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(5) %ptra + %a16 = zext i8 %a to i16 + %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 + store <2 x i16> %res, ptr addrspace(5) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 1602e31d6147c..9af4eae3413a7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4089,19 +4089,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi: -; GFX12-GISEL: ; %bb.0: -; 
GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16lo_reg_hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_b16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i16, ptr addrspace(1) %gep0 @@ -4125,19 +4118,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr ad ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -4162,19 +4148,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part 
epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -4199,19 +4178,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -4237,21 +4209,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -4276,21 +4239,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: 
global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -4492,21 +4446,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16hi_reg_hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds 
i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i16, ptr addrspace(1) %gep0 @@ -4530,21 +4475,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr ad ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -4569,21 +4505,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: 
global_load_u8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -4608,21 +4535,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, 
ptr addrspace(1) %gep0, i64 -128 @@ -4648,22 +4566,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -4688,22 +4596,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: 
global_load_i8 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits