================ @@ -0,0 +1,265 @@ +//===- AArch64SRLTDefineSuperRegs.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// When SubRegister Liveness Tracking (SRLT) is enabled, this pass adds +// extra implicit-def's to instructions that define the low N bits of +// a GPR/FPR register to also define the top bits, because all AArch64 +// instructions that write the low bits of a GPR/FPR also implicitly zero +// the top bits. For example, 'mov w0, w1' writes zeroes to the top 32-bits of +// x0, so this pass adds an `implicit-def $x0` after register allocation. +// +// These semantics are originally represented in the MIR using `SUBREG_TO_REG` +// which expresses that the top bits have been defined by the preceding +// instructions, but during register coalescing this information is lost and in +// contrast to when SRLT is disabled, when rewriting virtual -> physical +// registers the implicit-defs are not added to the instruction. +// +// There have been several attempts to fix this in the coalescer [1], but each +// iteration has exposed new bugs and the patch had to be reverted. +// Additionally, the concept of adding 'implicit-def' of a virtual register is +// particularly fragile and many places don't expect it (for example in +// `X86::commuteInstructionImpl` the code only looks at specific operands and +// does not consider implicit-defs. Similarly in `SplitEditor::addDeadDef` where +// it traverses operand 'defs' rather than 'all_defs'). +// +// We want a temporary solution that doesn't impact other targets and is simpler +// and less intrusive than the patch proposed for the register coalescer [1], so +// that we can enable SRLT for AArch64. 
+// +// The approach here is to just add the 'implicit-def' manually after rewriting +// virtual regs -> physical regs. This still means that during the register +// allocation process the dependences are not accurately represented in the MIR +// and LiveIntervals, but there are several reasons why we believe this isn't a +// problem in practice: +// (A) The register allocator only spills entire virtual registers. +// This is additionally guarded by code in +// AArch64InstrInfo::storeRegToStackSlot/loadRegFromStackSlot +// where it checks if a register matches the expected register class. +// (B) Rematerialization only happens when the instruction writes the full +// register. +// (C) The high bits of the AArch64 register cannot be written independently. +// (D) Instructions that write only part of a register always take that same +// register as a tied input operand, to indicate it's a merging operation. +// +// (A) means that for two virtual registers of regclass GPR32 and GPR64, if the +// GPR32 register is coalesced into the GPR64 vreg then the full GPR64 would +// be spilled/filled even if only the low 32-bits would be required for the +// given liverange. (B) means that the top bits of a GPR64 would never be +// overwritten by rematerialising a GPR32 sub-register for a given liverange. +// (C-D) means that we can assume that the MIR as input to the register +// allocator correctly expresses the instruction behaviour and dependences +// between values, so unless the register allocator would violate (A) or (B), +// the MIR is otherwise sound. +// +// Alternative approaches have also been considered, such as: +// (1) Changing the AArch64 instruction definitions to write all bits and +// extract the low N bits for the result. +// (2) Disabling coalescing of SUBREG_TO_REG and using regalloc hints to tell +// the register allocator to favour the same register for the input/output. 
+// (3) Adding a new coalescer guard node with a tied-operand constraint, such +// that when the SUBREG_TO_REG is removed, something still represents that +// the top bits are defined. The node would get removed before rewriting +// virtregs. +// (4) Using an explicit INSERT_SUBREG into a zero value and trying to optimize +// away the INSERT_SUBREG (this is a more explicit variant of (2) and (3)) +// (5) Adding a new MachineOperand flag that represents that the top bits would be +// defined, but are neither read nor undef. +// +// (1) would be the best approach but would be a significant effort as it +// requires rewriting most/all instruction definitions and fixing MIR passes +// that rely on the current definitions, whereas (2-4) result in sub-optimal +// code that can't really be avoided because the explicit nodes would stop +// rematerialization. (5) might be a way to mitigate the +// fragility of implicit-def's of virtual registers if we want to pursue +// landing [1], but then we'd rather choose approach (1) to avoid using +// SUBREG_TO_REG entirely. 
+// +// [1] https://github.com/llvm/llvm-project/pull/168353 +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-srlt-define-superregs" +#define PASS_NAME "AArch64 SRLT Define Super-Regs Pass" + +namespace { + +struct AArch64SRLTDefineSuperRegs : public MachineFunctionPass { + inline static char ID = 0; + + AArch64SRLTDefineSuperRegs() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void collectWidestSuperReg(Register R, const BitVector &RequiredBaseRegUnits, + const BitVector &QHiRegUnits, + SmallSet<Register, 8> &SuperRegs); + + StringRef getPassName() const override { return PASS_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + MachineFunction *MF = nullptr; + const AArch64Subtarget *Subtarget = nullptr; + const AArch64RegisterInfo *TRI = nullptr; + const AArch64FunctionInfo *AFI = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; +}; + +} // end anonymous namespace + +INITIALIZE_PASS(AArch64SRLTDefineSuperRegs, DEBUG_TYPE, PASS_NAME, false, + false) + +SmallVector<MCRegister> SupportedAliasRegs = { + AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, + AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9, + AArch64::X10, AArch64::X11, AArch64::X12, 
AArch64::X13, AArch64::X14, + AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19, + AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24, + AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP, + AArch64::LR, AArch64::SP, AArch64::Z0, AArch64::Z1, AArch64::Z2, + AArch64::Z3, AArch64::Z4, AArch64::Z5, AArch64::Z6, AArch64::Z7, + AArch64::Z8, AArch64::Z9, AArch64::Z10, AArch64::Z11, AArch64::Z12, + AArch64::Z13, AArch64::Z14, AArch64::Z15, AArch64::Z16, AArch64::Z17, + AArch64::Z18, AArch64::Z19, AArch64::Z20, AArch64::Z21, AArch64::Z22, + AArch64::Z23, AArch64::Z24, AArch64::Z25, AArch64::Z26, AArch64::Z27, + AArch64::Z28, AArch64::Z29, AArch64::Z30, AArch64::Z31}; + +SmallVector<MCRegister> QHiRegs = { + AArch64::Q0_HI, AArch64::Q1_HI, AArch64::Q2_HI, AArch64::Q3_HI, + AArch64::Q4_HI, AArch64::Q5_HI, AArch64::Q6_HI, AArch64::Q7_HI, + AArch64::Q8_HI, AArch64::Q9_HI, AArch64::Q10_HI, AArch64::Q11_HI, + AArch64::Q12_HI, AArch64::Q13_HI, AArch64::Q14_HI, AArch64::Q15_HI, + AArch64::Q16_HI, AArch64::Q17_HI, AArch64::Q18_HI, AArch64::Q19_HI, + AArch64::Q20_HI, AArch64::Q21_HI, AArch64::Q22_HI, AArch64::Q23_HI, + AArch64::Q24_HI, AArch64::Q25_HI, AArch64::Q26_HI, AArch64::Q27_HI, + AArch64::Q28_HI, AArch64::Q29_HI, AArch64::Q30_HI, AArch64::Q31_HI}; + +// Find the widest super-reg for a given reg and adds it to `SuperRegs`. 
+// For example: +// W0 -> X0 +// B1 -> Q1 (without SVE) +// -> Z1 (with SVE) +// W1_W2 -> X1_X2 +// D0_D1 -> Q0_Q1 (without SVE) +// -> Z0_Z1 (with SVE) +void AArch64SRLTDefineSuperRegs::collectWidestSuperReg( + Register R, const BitVector &RequiredBaseRegUnits, + const BitVector &QHiRegUnits, SmallSet<Register, 8> &SuperRegs) { + assert(R.isPhysical() && + "Expected to be run straight after virtregrewriter!"); + + BitVector Units(TRI->getNumRegUnits()); + for (MCRegUnit U : TRI->regunits(R)) + Units.set((unsigned)U); + + auto IsSuitableSuperReg = [&](Register SR) { + for (MCRegUnit U : TRI->regunits(SR)) { + // Avoid choosing z1 as super-reg of d1 if SVE is not available. ---------------- MacDue wrote:
nit: Maybe note `Q*_HI` are only set for SVE vectors (I associate `Q` with Neon) https://github.com/llvm/llvm-project/pull/174188 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
