https://github.com/saxlungs updated https://github.com/llvm/llvm-project/pull/198670
>From 9a486ec8077cded34e3cb9d93628cb11eb848cff Mon Sep 17 00:00:00 2001 From: Domenic Nutile <[email protected]> Date: Fri, 1 May 2026 12:36:31 -0400 Subject: [PATCH 1/3] [AMDGPU][True16] Legalize extloads into 16-bit registers Signed-off-by: Domenic Nutile <[email protected]> --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +- llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 118 ++++++++++++------ 2 files changed, 82 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 155294fae1781..b665421c69371 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -464,8 +464,8 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, MemSize = std::max(MemSize, Align); #endif - // Only 1-byte and 2-byte to 32-bit extloads are valid. - if (MemSize != RegSize && RegSize != 32) + // Only allow extloads to up to 32 bits. + if (MemSize != RegSize && RegSize > 32) return false; if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 2a07c3aa776d6..6ac69cc7e2f51 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2279,15 +2279,15 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg % ; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 
0xffff0000, v1, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX1250-SDAG-TRUE16: ; %bb.0: @@ -2297,6 +2297,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg % ; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2322,15 +2332,15 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p ; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; -; 
GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX1250-SDAG-TRUE16: ; %bb.0: @@ -2340,6 +2350,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p ; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 
0xffff0000, v1, v0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2754,16 +2774,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg % ; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX1250-SDAG-TRUE16: ; %bb.0: @@ -2775,6 +2795,17 @@ define amdgpu_ps <2 x half> 
@flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg % ; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2800,16 +2831,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p ; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; 
GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX1250-SDAG-TRUE16: ; %bb.0: @@ -2821,6 +2852,17 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p ; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 >From aa095dbeb51fa74705233606753ef4ed341fc544 Mon Sep 17 00:00:00 2001 From: Domenic Nutile <[email protected]> Date: Tue, 19 May 2026 16:24:25 
-0400 Subject: [PATCH 2/3] Add legalize rules and fix tests --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 9 +- .../legalize-sextload-s16-true16.mir | 15 +- .../CodeGen/AMDGPU/GlobalISel/load-d16.ll | 228 +++++++++++++----- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 114 +++++++-- 4 files changed, 268 insertions(+), 98 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a5f0facadadce..e16c013ef27b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1121,29 +1121,34 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}}) .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}}) .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}}) - + .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}}); addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zeroextending loads .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}}) + .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16) .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}}) + .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16) .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall) .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall) .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall) .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}}) + .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16) .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}}) .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}}) + .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16) .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall) .Any({{{UniS32, P4}, !isAlign4 
|| !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall) .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall) - .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}}); + .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}}) + .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16); addRulesForGOpcs({G_STORE}) // addrspace(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir index 99f4418e7a978..399bf4ab9b764 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir @@ -13,9 +13,8 @@ body: | ; TRUE16-NEXT: {{ $}} ; TRUE16-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; TRUE16-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr2 - ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5) - ; TRUE16-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXTLOAD]](s32) - ; TRUE16-NEXT: G_STORE [[TRUNC]](s16), [[COPY]](p1) :: (store (s16), addrspace 1) + ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s16) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5) + ; TRUE16-NEXT: G_STORE [[SEXTLOAD]](s16), [[COPY]](p1) :: (store (s16), addrspace 1) ; ; FAKE16-LABEL: name: test_sextload_global_s16_from_s8 ; FAKE16: liveins: $vgpr0_vgpr1, $vgpr2 @@ -41,9 +40,8 @@ body: | ; TRUE16-NEXT: {{ $}} ; TRUE16-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; TRUE16-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr1 - ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5) - ; TRUE16-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXTLOAD]](s32) - ; TRUE16-NEXT: G_STORE [[TRUNC]](s16), [[COPY]](p3) :: (store (s16), addrspace 3) + ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s16) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5) + ; TRUE16-NEXT: G_STORE [[SEXTLOAD]](s16), [[COPY]](p3) :: (store (s16), addrspace 3) ; ; FAKE16-LABEL: 
name: test_sextload_local_s16_from_s8 ; FAKE16: liveins: $vgpr0, $vgpr1 @@ -69,9 +67,8 @@ body: | ; TRUE16-NEXT: {{ $}} ; TRUE16-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; TRUE16-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr1 - ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5) - ; TRUE16-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXTLOAD]](s32) - ; TRUE16-NEXT: G_STORE [[TRUNC]](s16), [[COPY]](p5) :: (store (s16), addrspace 5) + ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s16) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5) + ; TRUE16-NEXT: G_STORE [[SEXTLOAD]](s16), [[COPY]](p5) :: (store (s16), addrspace 5) ; ; FAKE16-LABEL: name: test_sextload_private_s16_from_s8 ; FAKE16: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll index 166f439a61430..387944cf7811a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll @@ -29,12 +29,22 @@ define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra } define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { -; GFX12-LABEL: sextload_P0_i8_D16: -; GFX12: ; %bb.0: -; GFX12-NEXT: flat_load_d16_i8 v0, v[1:2] -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: flat_store_b32 v[3:4], v0 -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P0_i8_D16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: flat_load_d16_i8 v1, v[1:2] +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 +; GFX12-TRUE16-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P0_i8_D16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: flat_load_d16_i8 v0, v[1:2] +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX12-FAKE16-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(0) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 @@ -43,12 +53,23 @@ define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra } define amdgpu_ps void @sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) { -; GFX12-LABEL: sextload_P0_i8_D16_Hi: -; GFX12: ; %bb.0: -; GFX12-NEXT: flat_load_d16_hi_i8 v0, v[1:2] -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: flat_store_b32 v[3:4], v0 -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P0_i8_D16_Hi: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: flat_load_d16_i8 v1, v[1:2] +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P0_i8_D16_Hi: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: flat_load_d16_hi_i8 v0, v[1:2] +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(0) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 @@ -111,12 +132,22 @@ define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra } define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { -; GFX12-LABEL: sextload_P1_i8_D16: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P1_i8_D16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: 
global_load_d16_i8 v1, v[1:2], off +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 +; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P1_i8_D16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: global_load_d16_i8 v0, v[1:2], off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(1) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 @@ -125,12 +156,23 @@ define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra } define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) { -; GFX12-LABEL: sextload_P1_i8_D16_Hi: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P1_i8_D16_Hi: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: global_load_d16_i8 v1, v[1:2], off +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P1_i8_D16_Hi: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: global_load_d16_hi_i8 v0, v[1:2], off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(1) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 @@ 
-193,12 +235,22 @@ define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra } define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { -; GFX12-LABEL: sextload_P3_i8_D16: -; GFX12: ; %bb.0: -; GFX12-NEXT: ds_load_i8_d16 v0, v1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: ds_store_b32 v2, v0 -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P3_i8_D16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ds_load_i8_d16 v1, v1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 +; GFX12-TRUE16-NEXT: ds_store_b32 v2, v0 +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P3_i8_D16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: ds_load_i8_d16 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: ds_store_b32 v2, v0 +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(3) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 @@ -207,12 +259,23 @@ define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra } define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) { -; GFX12-LABEL: sextload_P3_i8_D16_Hi: -; GFX12: ; %bb.0: -; GFX12-NEXT: ds_load_i8_d16_hi v0, v1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: ds_store_b32 v2, v0 -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P3_i8_D16_Hi: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ds_load_i8_d16 v1, v1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: ds_store_b32 v2, v0 +; GFX12-TRUE16-NEXT: s_endpgm +; +; 
GFX12-FAKE16-LABEL: sextload_P3_i8_D16_Hi: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: ds_load_i8_d16_hi v0, v1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: ds_store_b32 v2, v0 +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(3) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 @@ -275,12 +338,22 @@ define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra } define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { -; GFX12-LABEL: sextload_P4_i8_D16: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P4_i8_D16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: global_load_d16_i8 v1, v[1:2], off +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 +; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P4_i8_D16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: global_load_d16_i8 v0, v[1:2], off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 @@ -289,12 +362,23 @@ define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra } define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) { -; GFX12-LABEL: sextload_P4_i8_D16_Hi: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_endpgm +; 
GFX12-TRUE16-LABEL: sextload_P4_i8_D16_Hi: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: global_load_d16_i8 v1, v[1:2], off +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P4_i8_D16_Hi: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: global_load_d16_hi_i8 v0, v[1:2], off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 @@ -357,12 +441,22 @@ define amdgpu_ps void @load_P5_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra } define amdgpu_ps void @sextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { -; GFX12-LABEL: sextload_P5_i8_D16: -; GFX12: ; %bb.0: -; GFX12-NEXT: scratch_load_d16_i8 v0, v1, off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: scratch_store_b32 v2, v0, off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P5_i8_D16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: scratch_load_d16_i8 v1, v1, off +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 +; GFX12-TRUE16-NEXT: scratch_store_b32 v2, v0, off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P5_i8_D16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: scratch_load_d16_i8 v0, v1, off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: scratch_store_b32 v2, v0, off +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(5) %ptra %a16 = sext i8 %a to 
i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 0 @@ -371,12 +465,23 @@ define amdgpu_ps void @sextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra } define amdgpu_ps void @sextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) { -; GFX12-LABEL: sextload_P5_i8_D16_Hi: -; GFX12: ; %bb.0: -; GFX12-NEXT: scratch_load_d16_hi_i8 v0, v1, off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: scratch_store_b32 v2, v0, off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: sextload_P5_i8_D16_Hi: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: scratch_load_d16_i8 v1, v1, off +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: scratch_store_b32 v2, v0, off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sextload_P5_i8_D16_Hi: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: scratch_load_d16_hi_i8 v0, v1, off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: scratch_store_b32 v2, v0, off +; GFX12-FAKE16-NEXT: s_endpgm %a = load i8, ptr addrspace(5) %ptra %a16 = sext i8 %a to i16 %res = insertelement <2 x i16> %vec, i16 %a16, i32 1 @@ -411,6 +516,3 @@ define amdgpu_ps void @zextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %p store <2 x i16> %res, ptr addrspace(5) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX12-FAKE16: {{.*}} -; GFX12-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 9e4c6e6935596..2c16351a9bb4d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4203,12 +4203,28 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX12-GISEL-TRUE16: ; %bb.0: +; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3] +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX12-GISEL-FAKE16: ; %bb.0: +; GFX12-GISEL-FAKE16-NEXT: global_load_d16_i8 v1, v0, s[2:3] +; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -4233,12 +4249,28 @@ define 
amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX12-GISEL-TRUE16: ; %bb.0: +; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX12-GISEL-FAKE16: ; %bb.0: +; GFX12-GISEL-FAKE16-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128 +; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -4596,12 +4628,29 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
global_load_d16_hi_i8 v1, v0, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX12-GISEL-TRUE16: ; %bb.0: +; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3] +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX12-GISEL-FAKE16: ; %bb.0: +; GFX12-GISEL-FAKE16-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] +; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -4626,12 +4675,29 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX12-SDAG: ; 
%bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX12-GISEL-TRUE16: ; %bb.0: +; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX12-GISEL-FAKE16: ; %bb.0: +; GFX12-GISEL-FAKE16-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 +; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 >From b075400f070477d63df1cfaa320f8c7c39478aab Mon Sep 17 00:00:00 2001 From: Domenic Nutile <[email protected]> Date: Thu, 21 May 2026 11:14:47 -0400 Subject: [PATCH 3/3] Update comment around destination reg size for clarity --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b665421c69371..5ed7255a97c6f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -464,7 +464,11 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, MemSize = 
std::max(MemSize, Align); #endif - // Only allow extloads to up to 32 bits. + // Allow extending loads into destination registers of up to 32 bits. + // On its own this would also allow 16-bit destinations even without + // True16. This function is used by isLoadStoreLegal, which also calls + // isRegisterType on the destination register type; that check disallows + // 16-bit types without True16, so this is safe. if (MemSize != RegSize && RegSize > 32) return false; _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
