================
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
   In.Reg = Copy.getReg(0);
 }
 
+void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
+                                Register NewReg) {
+  for (MachineOperand &Op : Inst->operands()) {
+    if (Op.isReg() && Op.getReg() == Reg)
+      Op.setReg(NewReg);
+  }
+}
+
+bool DivergenceLoweringHelper::lowerTemporalDivergence() {
+  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
+
+  for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
+    Register Reg = Inst->getOperand(0).getReg();
+    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
+        ILMA.isS32S64LaneMask(Reg))
+      continue;
+
+    MachineBasicBlock *MBB = Inst->getParent();
+    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));
+
+    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
+    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
+        .addUse(ExecReg, RegState::Implicit);
+
+    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
+  }
+  return false;
+}
----------------
nhaehnle wrote:
I do have one high-level comment about this. Every `Inst` may potentially appear with many `UseInst`s in the temporal divergence list. The current code will create multiple new registers and multiple `COPY` instructions, which seems wasteful even if downstream passes can often clean it up.

I would suggest capturing the created register in a `DenseMap<Instruction *, Register>` for re-use.

Also, how about inserting the `COPY` at the end of `Inst->getParent()`? That way, the live range of the VGPR is reduced.

https://github.com/llvm/llvm-project/pull/124298
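For concreteness, a minimal sketch of the suggested reuse scheme, assuming the `DivergenceLoweringHelper` members visible in the patch (`MF`, `MRI`, `MUI`, `B`, `ExecReg`) and the `replaceUsesOfRegInInstWith` helper above. Since the loop walks MIR, the map is keyed by `MachineInstr *` rather than `Instruction *`, and the `VgprCache` name is hypothetical:

```cpp
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
  // One cached VGPR per defining instruction, so each Inst gets at most
  // one COPY no matter how many UseInsts reference it in the list.
  DenseMap<MachineInstr *, Register> VgprCache;

  for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
    Register Reg = Inst->getOperand(0).getReg();
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    Register &VgprReg = VgprCache[Inst];
    if (!VgprReg.isValid()) {
      // Insert the COPY at the end of the defining block (before its
      // terminators) to keep the VGPR live range as short as possible.
      MachineBasicBlock *MBB = Inst->getParent();
      B.setInsertPt(*MBB, MBB->getFirstTerminator());

      VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
      B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
          .addUse(ExecReg, RegState::Implicit);
    }

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
  }
  return false;
}
```

The cache lookup uses a reference into the map so the newly created register is stored in place; a default-constructed `Register` is invalid, which serves as the "not yet copied" sentinel.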