https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/178608
>From 0b11a343e00706e2a207c35f3aac2321d73de60b Mon Sep 17 00:00:00 2001 From: Ruiling Song <[email protected]> Date: Thu, 29 Jan 2026 14:36:01 +0800 Subject: [PATCH 1/3] [AMDGPU] Support one immediate folding for global load The address calculation may happen on i32 and be sign extended to the i64 offset. --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 30 +++++++++- .../AMDGPU/AMDGPUInstructionSelector.cpp | 59 +++++++++++++++---- .../CodeGen/AMDGPU/load-saddr-offset-imm.ll | 16 ++--- 3 files changed, 80 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 1446c84ef733b..1d6a7b4988528 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1981,6 +1981,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, bool NeedIOffset) const { int64_t ImmOffset = 0; ScaleOffset = false; + const SIInstrInfo *TII = Subtarget->getInstrInfo(); // Match the immediate offset first, which canonically is moved as low as // possible. @@ -1988,7 +1989,6 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue LHS, RHS; if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); if (NeedIOffset && TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, @@ -2037,13 +2037,37 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, LHS = Addr.getOperand(0); if (!LHS->isDivergent()) { - // add (i64 sgpr), (*_extend (i32 vgpr)) RHS = Addr.getOperand(1); - ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtRHS = matchExtFromI32orI32( RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { + // add (i64 sgpr), (*_extend (scale (i32 vgpr))) SAddr = LHS; VOffset = ExtRHS; + if (NeedIOffset && !ImmOffset && + CurDAG->isBaseWithConstantOffset(ExtRHS)) { + // add (i64 sgpr), (*_extend (add (scale (i32 vgpr)), (i32 imm))) + int64_t COffset = + cast<ConstantSDNode>(ExtRHS.getOperand(1))->getSExtValue(); + if (TII->isLegalFLATOffset(COffset, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal)) { + VOffset = ExtRHS.getOperand(0); + ImmOffset = + cast<ConstantSDNode>(ExtRHS.getOperand(1))->getSExtValue(); + } + } + + ScaleOffset = + SelectScaleOffset(N, VOffset, Subtarget->hasSignedGVSOffset()); + } else { + // add (i64 sgpr), (scale (*_extend (i32 vgpr))) + ScaleOffset = + SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtRHS = matchExtFromI32orI32( + RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { + SAddr = LHS; + VOffset = ExtRHS; + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index e239e6f56cb44..6fdcca3443588 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5843,24 +5843,59 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // It's possible voffset is an SGPR here, but the copy to VGPR will be // inserted later. - bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset, - Subtarget->hasSignedGVSOffset()); if (Register VOffset = matchExtendFromS32OrS32( PtrBaseOffset, Subtarget->hasSignedGVSOffset())) { + if (NeedIOffset && !ImmOffset) { + MachineInstr *VOffsetDef = getDefIgnoringCopies(VOffset, *MRI); + if (VOffsetDef->getOpcode() == TargetOpcode::G_ADD) { + Register RHS = VOffsetDef->getOperand(2).getReg(); + std::optional<ValueAndVReg> RHSValReg = + getIConstantVRegValWithLookThrough(RHS, *MRI); + if (RHSValReg && + TII.isLegalFLATOffset(RHSValReg->Value.getSExtValue(), + AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal)) { + VOffset = VOffsetDef->getOperand(1).getReg(); + ImmOffset = RHSValReg->Value.getSExtValue(); + } + } + } + + bool ScaleOffset = + selectScaleOffset(Root, VOffset, Subtarget->hasSignedGVSOffset()); if (NeedIOffset) - return {{[=](MachineInstrBuilder &MIB) { // saddr - MIB.addReg(SAddr); - }, - [=](MachineInstrBuilder &MIB) { // voffset - MIB.addReg(VOffset); - }, - [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(ImmOffset); - }, - [=](MachineInstrBuilder &MIB) { // cpol + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits | (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); }}}; + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); + }}}; + } else { + bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset, + Subtarget->hasSignedGVSOffset()); + if (Register VOffset = matchExtendFromS32OrS32( + PtrBaseOffset, Subtarget->hasSignedGVSOffset())) + if (NeedIOffset) + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }, + [=](MachineInstrBuilder &MIB) { // cpol + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); + }}}; return {{[=](MachineInstrBuilder &MIB) { // saddr MIB.addReg(SAddr); }, diff --git a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll index a1e229d09b777..fd26847d83cf8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll @@ -10,8 +10,8 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; @@ -19,9 +19,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 scale_offset ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; @@ -29,8 +27,8 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog ; @@ -38,9 +36,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 scale_offset ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog %v = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) >From 1da76c3735d9741b62f7724ead42a8eaa01bd674 Mon Sep 17 00:00:00 2001 From: Ruiling Song <[email protected]> Date: Thu, 29 Jan 2026 22:17:43 +0800 Subject: [PATCH 2/3] [AMDGPU] add back missing parenthesis --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 6fdcca3443588..fe9ac1bed1741 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5881,7 +5881,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset, Subtarget->hasSignedGVSOffset()); if (Register VOffset = matchExtendFromS32OrS32( - PtrBaseOffset, Subtarget->hasSignedGVSOffset())) + PtrBaseOffset, Subtarget->hasSignedGVSOffset())) { if (NeedIOffset) return {{[=](MachineInstrBuilder &MIB) { // saddr MIB.addReg(SAddr); @@ -5896,16 +5896,17 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, MIB.addImm(CPolBits | (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); }}}; - return {{[=](MachineInstrBuilder &MIB) { // saddr - MIB.addReg(SAddr); - }, - [=](MachineInstrBuilder &MIB) { // voffset - MIB.addReg(VOffset); - }, - [=](MachineInstrBuilder &MIB) { // cpol - MIB.addImm(CPolBits | - (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); - }}}; + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // cpol + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); + }}}; + } } } } >From 068c3d289ab29c7493c14e8dfd5be2c718cb9c28 Mon Sep 17 00:00:00 2001 From: Ruiling Song <[email protected]> Date: Thu, 29 Jan 2026 22:23:37 +0800 Subject: [PATCH 3/3] [AMDGPU] Address review comment --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 1d6a7b4988528..e8c722bd6f500 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2052,8 +2052,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, if (TII->isLegalFLATOffset(COffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { VOffset = ExtRHS.getOperand(0); - ImmOffset = - cast<ConstantSDNode>(ExtRHS.getOperand(1))->getSExtValue(); + ImmOffset = COffset; } } _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
