================ @@ -1200,16 +1200,61 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); } -static bool inlineAsmUsesAGPRs(const InlineAsm *IA) { - for (const auto &CI : IA->ParseConstraints()) { +/// Compute the minimum number of AGPRs required to allocate the inline asm. +static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA, + const CallBase &Call) { + unsigned ArgNo = 0; + unsigned ResNo = 0; + unsigned AGPRDefCount = 0; + unsigned AGPRUseCount = 0; + unsigned MaxPhysReg = 0; + const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout(); + + for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + Type *Ty = nullptr; + switch (CI.Type) { + case InlineAsm::isOutput: { + Ty = Call.getType(); + if (auto *STy = dyn_cast<StructType>(Ty)) + Ty = STy->getElementType(ResNo); + ++ResNo; + break; + } + case InlineAsm::isInput: { + Ty = Call.getArgOperand(ArgNo++)->getType(); + break; + } + case InlineAsm::isLabel: + continue; + case InlineAsm::isClobber: + // Parse the physical register reference. + break; + } + for (StringRef Code : CI.Codes) { - Code.consume_front("{"); - if (Code.starts_with("a")) - return true; + if (Code.starts_with("a")) { + // Virtual register, compute number of registers based on the type. + // + // We ought to be going through TargetLowering to get the number of + // registers, but we should avoid the dependence on CodeGen here. 
+ unsigned RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32); + if (CI.Type == InlineAsm::isOutput) { + AGPRDefCount += RegCount; + if (CI.isEarlyClobber) + AGPRUseCount += RegCount; + } else + AGPRUseCount += RegCount; + } else { + // Physical register reference + auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code); + if (Kind == 'a') + MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u)); + } } } - return false; + unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount); + return std::min(MaxVirtReg + MaxPhysReg, 256u); ---------------- ritter-x2a wrote:
For this code

```
define amdgpu_kernel void @foo() {
  call void asm sideeffect "; use $0, $1, $2", "{a16},a,a"(i32 17, <8 x i32> splat (i32 1), <16 x i32> splat (i32 2))
  ret void
}
```

we allocate `; use a16, a[18:25], a[0:15]`, so the asm uses 25 AGPRs. Arguably it needs 26, since `a25` is used and `a17` is left out; I'm not sure why the second operand isn't allocated as `a[17:24]`, as I'm not aware of any alignment requirements for AGPRs.

This function computes 17 (the highest required physical register index + 1) + 24 (the number of virtual registers required) = 41 required AGPRs. If this over-approximation is intended, it seems worth pointing out in a comment.

https://github.com/llvm/llvm-project/pull/150910
_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits