https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/170657
>From 5c91afe00979438edfb664dfa69f22fff71c655d Mon Sep 17 00:00:00 2001 From: Matt Arsenault <[email protected]> Date: Thu, 4 Dec 2025 12:42:14 +0100 Subject: [PATCH] AMDGPU: Avoid crashing on statepoint-like pseudoinstructions At the moment the MIR tests are somewhat redundant. The waitcnt one is needed to ensure we actually have a load, given we are currently just emitting an error on ExternalSymbol. The asm printer one is more redundant for the moment, since it's stressed by the IR test. However I am planning to change the error path for the IR test, so it will soon not be redundant. --- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 13 +++- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 11 ++++ .../SelectionDAG/StatepointLowering.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 10 +++ .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 5 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 ++++ llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 + llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + llvm/test/CodeGen/AMDGPU/llvm.deoptimize.ll | 16 +++++ .../CodeGen/AMDGPU/statepoint-asm-printer.mir | 40 ++++++++++++ .../AMDGPU/statepoint-insert-waitcnts.mir | 64 +++++++++++++++++++ 13 files changed, 184 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.deoptimize.ll create mode 100644 llvm/test/CodeGen/AMDGPU/statepoint-asm-printer.mir create mode 100644 llvm/test/CodeGen/AMDGPU/statepoint-insert-waitcnts.mir diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 18142c2c0adf3..bdd9fee795e08 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -2350,7 +2350,18 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// Returns the callee operand from the given \p MI. virtual const MachineOperand &getCalleeOperand(const MachineInstr &MI) const { - return MI.getOperand(0); + assert(MI.isCall()); + + switch (MI.getOpcode()) { + case TargetOpcode::STATEPOINT: + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return MI.getOperand(3); + default: + return MI.getOperand(0); + } + + llvm_unreachable("impossible call instruction"); } /// Return the uniformity behavior of the given instruction. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index dd8f18d3b8a6a..7998da0ea06eb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -331,6 +331,17 @@ namespace llvm { MachineBasicBlock * TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { + switch (MI.getOpcode()) { + case TargetOpcode::STATEPOINT: + // As an implementation detail, STATEPOINT shares the STACKMAP format at + // this point in the process. We diverge later. + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, MBB); + default: + break; + } + #ifndef NDEBUG dbgs() << "If a target marks an instruction with " "'usesCustomInserter', it must implement " diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 46a5e44374e1c..5b8cd343557fa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -1145,6 +1145,8 @@ void SelectionDAGBuilder::LowerCallSiteWithDeoptBundleImpl( const CallBase *Call, SDValue Callee, const BasicBlock *EHPadBB, bool VarArgDisallowed, bool ForceVoidReturnTy) { StatepointLoweringInfo SI(DAG); + SI.CLI.CB = Call; + unsigned ArgBeginIndex = Call->arg_begin() - Call->op_begin(); populateCallLoweringInfo( SI.CLI, Call, ArgBeginIndex, Call->arg_size(), Callee, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index bf9b4297bd435..99c1ab8d379d5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -406,6 +406,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + unsigned Opc = MI->getOpcode(); + if (LLVM_UNLIKELY(Opc == TargetOpcode::STATEPOINT || + Opc == TargetOpcode::STACKMAP || + Opc == TargetOpcode::PATCHPOINT)) { + LLVMContext &Ctx = MI->getMF()->getFunction().getContext(); + Ctx.emitError("unhandled statepoint-like instruction"); + OutStreamer->emitRawComment("unsupported statepoint/stackmap/patchpoint"); + return; + } + if (isVerbose()) if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode())) emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index b03d50f2d451d..4e664e084fb88 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -256,10 +256,13 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( // Pseudo used just to encode the underlying global. Is there a better // way to track this? + // TODO: Some of the generic call-like pseudos do not encode the callee, + // so we overly conservatively treat this as an indirect call. const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - const Function *Callee = getCalleeFunction(*CalleeOp); + const Function *Callee = + CalleeOp ? getCalleeFunction(*CalleeOp) : nullptr; auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 301f2fc8dab45..0660e3a64c70e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -275,6 +275,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom); + setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); @@ -6838,6 +6839,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); return LowerGlobalAddress(MFI, Op, DAG); } + case ISD::ExternalSymbol: + return LowerExternalSymbol(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: @@ -9017,6 +9020,15 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, MachineMemOperand::MOInvariant); } +SDValue SITargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Handle this. It should be mostly the same as LowerGlobalAddress. + const Function &Fn = DAG.getMachineFunction().getFunction(); + DAG.getContext()->diagnose(DiagnosticInfoUnsupported( + Fn, "unsupported external symbol", Op.getDebugLoc())); + return DAG.getPOISON(Op.getValueType()); +} + SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const { // We can't use S_MOV_B32 directly, because there is no way to specify m0 as diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index fb162948caf4c..e82f4528fcd09 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -79,6 +79,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const; SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 79c3394b2df50..6d8343dd411d0 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1955,7 +1955,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // load). We also need to check WAW dependency with saved PC. Wait = AMDGPU::Waitcnt(); - const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI); if (CallAddrOp.isReg()) { RegInterval CallAddrOpInterval = ScoreBrackets.getRegInterval(&MI, CallAddrOp); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7535407741f1f..ef011236642f8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10510,6 +10510,14 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return SchedModel.computeInstrLatency(&MI); } +const MachineOperand & +SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const { + if (const MachineOperand *CallAddrOp = + getNamedOperand(MI, AMDGPU::OpName::src0)) + return *CallAddrOp; + return TargetInstrInfo::getCalleeOperand(MI); +} + InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index b1d6563bf3c0b..7d7cbda644b77 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1641,6 +1641,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const MachineInstr &MI, unsigned *PredCost = nullptr) const override; + const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override; + InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.deoptimize.ll b/llvm/test/CodeGen/AMDGPU/llvm.deoptimize.ll new file mode 100644 index 0000000000000..ef8c00bb3b4b0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.deoptimize.ll @@ -0,0 +1,16 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 2> %t.err | FileCheck %s +; RUN: FileCheck -check-prefix=ERR %s < %t.err + +; ERR: error: <unknown>:0:0: in function caller_0 i32 (): unsupported external symbol +; ERR: error: unhandled statepoint-like instruction + +; CHECK: ;unsupported statepoint/stackmap/patchpoint +declare i32 @llvm.experimental.deoptimize.i32(...) +declare i8 @llvm.experimental.deoptimize.i8(...) + +define i32 @caller_0() { +entry: + %v = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0) ] + ret i32 %v +} + diff --git a/llvm/test/CodeGen/AMDGPU/statepoint-asm-printer.mir b/llvm/test/CodeGen/AMDGPU/statepoint-asm-printer.mir new file mode 100644 index 0000000000000..1c9ae91557b7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/statepoint-asm-printer.mir @@ -0,0 +1,40 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-asm-printer -o - %s 2> %t.err | FileCheck %s +# RUN: FileCheck -check-prefix=ERR %s < %t.err + +# CHECK: ;unsupported statepoint/stackmap/patchpoint +# ERR: error: unhandled statepoint-like instruction + +--- +name: test_statepoint +tracksRegLiveness: true +frameInfo: + stackSize: 16 + maxAlignment: 4 + adjustsStack: true + hasCalls: true + maxCallFrameSize: 0 + isCalleeSavedInfoValid: true +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0.entry: + liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + + S_WAITCNT 0 + $sgpr16 = S_MOV_B32 $sgpr33 + $sgpr33 = S_MOV_B32 $sgpr32 + $sgpr18_sgpr19 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + $exec = S_MOV_B64 killed $sgpr18_sgpr19 + $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 + $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40 + $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 1024, implicit-def dead $scc + $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40 + STATEPOINT 2882400015, 0, 11, undef renamable $sgpr4_sgpr5, 0, killed $sgpr4_sgpr5, killed $sgpr6_sgpr7, killed $sgpr8_sgpr9, killed $sgpr10_sgpr11, killed $sgpr12, killed $sgpr13, killed $sgpr14, killed $sgpr15, killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, csr_amdgpu +... + diff --git a/llvm/test/CodeGen/AMDGPU/statepoint-insert-waitcnts.mir b/llvm/test/CodeGen/AMDGPU/statepoint-insert-waitcnts.mir new file mode 100644 index 0000000000000..00b1f6e33412f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/statepoint-insert-waitcnts.mir @@ -0,0 +1,64 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s + +# Make sure the waitcnt pass doesn't crash on statepoint +# pseudoinstructions, and handlest he wait for the callee operand +# correctly. + +--- +name: test_wait_statepoint_callee +tracksRegLiveness: true +frameInfo: + stackSize: 16 + maxAlignment: 4 + adjustsStack: true + hasCalls: true + maxCallFrameSize: 0 + isCalleeSavedInfoValid: true +stack: + - { id: 0, offset: 4, size: 4, alignment: 4 } + - { id: 1, type: spill-slot, size: 4, alignment: 4 } + - { id: 2, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + spillPhysVGPRs: + - '$vgpr40' + wwmReservedRegs: + - '$vgpr40' + scavengeFI: '%stack.0' +body: | + bb.0: + liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13 + + ; CHECK-LABEL: name: test_wait_statepoint_callee + ; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAITCNT 0 + ; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32 + ; CHECK-NEXT: $sgpr18_sgpr19 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr18_sgpr19 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40 + ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 1024, implicit-def dead $scc + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40 + ; CHECK-NEXT: $sgpr14_sgpr15 = S_LOAD_DWORDX2_IMM $sgpr12_sgpr13, 0, 0 + ; CHECK-NEXT: S_WAITCNT 49279 + ; CHECK-NEXT: STATEPOINT 2882400015, 0, 11, renamable $sgpr14_sgpr15, 0, killed $sgpr4_sgpr5, killed $sgpr6_sgpr7, killed $sgpr8_sgpr9, killed $sgpr10_sgpr11, killed $sgpr12, killed $sgpr13, killed $sgpr14, killed $sgpr15, killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, csr_amdgpu + $sgpr16 = S_MOV_B32 $sgpr33 + $sgpr33 = S_MOV_B32 $sgpr32 + $sgpr18_sgpr19 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + $exec = S_MOV_B64 killed $sgpr18_sgpr19 + $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 + $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40 + $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 1024, implicit-def dead $scc + $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40 + $sgpr14_sgpr15 = S_LOAD_DWORDX2_IMM $sgpr12_sgpr13, 0, 0 + STATEPOINT 2882400015, 0, 11, renamable $sgpr14_sgpr15, 0, killed $sgpr4_sgpr5, killed $sgpr6_sgpr7, killed $sgpr8_sgpr9, killed $sgpr10_sgpr11, killed $sgpr12, killed $sgpr13, killed $sgpr14, killed $sgpr15, killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, csr_amdgpu + +... _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
