llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-aarch64 Author: None (llvmbot) <details> <summary>Changes</summary> Backport 9fc7c429752ed87a36f383ee47bad575fea7702a 0133247567a2e69e107bcdd4b1d72fe93b7f93f9 91f5d73b311f3622517ff1d34d21cc8ef1f52ea9 Requested by: @<!-- -->sdesmalen-arm --- Patch is 1020.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176197.diff 120 Files Affected: - (modified) llvm/lib/Target/AArch64/AArch64.h (+2) - (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+5-5) - (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+2-3) - (added) llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp (+248) - (modified) llvm/lib/Target/AArch64/AArch64Subtarget.cpp (+16-3) - (modified) llvm/lib/Target/AArch64/AArch64Subtarget.h (+7-1) - (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (+15-1) - (modified) llvm/lib/Target/AArch64/CMakeLists.txt (+1) - (modified) llvm/test/CodeGen/AArch64/O3-pipeline.ll (+1) - (modified) llvm/test/CodeGen/AArch64/active_lane_mask.ll (+2-15) - (modified) llvm/test/CodeGen/AArch64/arm64-addrmode.ll (+40-90) - (modified) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+3-9) - (modified) llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll (+3-9) - (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+11-53) - (modified) llvm/test/CodeGen/AArch64/ldst-implicitop.mir (+29) - (modified) llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll (+5-10) - (modified) llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir (-1) - (modified) llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll (-3) - (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll (-1) - (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll (-1) - (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+10-40) - (modified) llvm/test/CodeGen/AArch64/sme-streaming-body.ll (-2) - 
(modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll (-24) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfmul.ll (-18) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll (-18) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtn.ll (-4) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll (+78-78) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fclamp.ll (-18) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll (+2-98) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fscale.ll (-54) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll (+2-2) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll (+2-6) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll (+182-218) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll (+182-218) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll (+92-152) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll (+1-192) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4-fp8.ll (-8) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll (-64) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x2.ll (-82) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll (+104-104) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sclamp.ll (-24) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-select-sme-tileslice.ll (-2) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll (+52-52) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll (-60) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll (-26) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-uclamp.ll (-24) - (modified) llvm/test/CodeGen/AArch64/sme2p2-intrinsics-fmul.ll (-54) - (added) llvm/test/CodeGen/AArch64/subreg-liveness-fix-subreg-to-reg-implicit-def.mir (+107) - (modified) llvm/test/CodeGen/AArch64/subreg_to_reg_coalescing_issue.mir (+1-2) - 
(modified) llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll (+2-4) - (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll (-38) - (modified) llvm/test/CodeGen/AArch64/sve-fmsub.ll (+2-50) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-contiguous-prefetches.ll (+4-4) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-sret-reg+imm-addr-mode.ll (+1-9) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll (+2-5) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-sqdec.ll (+2-26) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-sqinc.ll (+2-26) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll (+2-121) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-reg-addr-mode.ll (+2-65) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll (+2-83) - (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll (+2-2) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll (+4-16) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll (+3-13) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll (+3-13) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll (+3-37) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll (+9-69) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll (+3-5) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll (+3-23) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll (+3-39) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll (+3-20) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll (+3-29) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll (+3-18) - (modified) 
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll (+10-10) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll (+3-135) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll (+3-22) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll (+7-14) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll (+3-63) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll (+3-87) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll (+3-23) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+8-32) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll (+4-144) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll (+3-27) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll (+81-141) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll (+15-39) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll (+3-75) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll (+3-99) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mul.ll (+3-6) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll (+6-126) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll (+3-38) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll (+78-132) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll (+3-36) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll (+7-97) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll (+12-52) - (modified) 
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll (+7-7) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll (+3-5) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll (+3-5) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll (+3-30) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll (+3-4) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll (+3-3) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll (+3-4) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll (+26-26) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll (+12-16) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll (+2-3) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll (+3-37) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll (+3-23) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll (+8-12) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll (+4-4) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll (+355-365) - (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll (+6-33) - (modified) llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll (+30-61) - (modified) llvm/test/CodeGen/AArch64/sve-vector-interleave.ll (+5-64) - (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll (+6-6) - (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-while.ll (+3-3) - (modified) llvm/test/CodeGen/AArch64/sve2p1-dots-partial-reduction.ll (+2-10) - (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-crypto.ll (+2-28) - (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll (+4-157) - 
(modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll (+56-56) - (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll (+51-51) - (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll (+10-10) - (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll (+4-36) ``````````diff diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index a8e15c338352a..40983714ddf1d 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -64,6 +64,7 @@ FunctionPass *createAArch64CollectLOHPass(); FunctionPass *createSMEABIPass(); FunctionPass *createSMEPeepholeOptPass(); FunctionPass *createMachineSMEABIPass(CodeGenOptLevel); +FunctionPass *createAArch64SRLTDefineSuperRegsPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, @@ -117,6 +118,7 @@ void initializeLDTLSCleanupPass(PassRegistry&); void initializeSMEABIPass(PassRegistry &); void initializeSMEPeepholeOptPass(PassRegistry &); void initializeMachineSMEABIPass(PassRegistry &); +void initializeAArch64SRLTDefineSuperRegsPass(PassRegistry &); void initializeSVEIntrinsicOptsPass(PassRegistry &); void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 45599de6a4828..3d9444c0c5426 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -833,10 +833,10 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { } } -static bool isRewritableImplicitDef(unsigned Opc) { - switch (Opc) { +static bool isRewritableImplicitDef(const MachineOperand &MO) { + switch (MO.getParent()->getOpcode()) { default: - return false; + return MO.isRenamable(); case AArch64::ORRWrs: case AArch64::ADDWri: return true; @@ -1047,7 +1047,7 @@ 
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MI.getRegClassConstraint(OpIdx, TII, TRI)) MatchingReg = GetMatchingSubReg(RC); else { - if (!isRewritableImplicitDef(MI.getOpcode())) + if (!isRewritableImplicitDef(MOP)) continue; MatchingReg = GetMatchingSubReg( TRI->getMinimalPhysRegClass(MOP.getReg())); @@ -1739,7 +1739,7 @@ static bool canRenameMOP(const MachineOperand &MOP, // them must be known. For example, in ORRWrs the implicit-def // corresponds to the result register. if (MOP.isImplicit() && MOP.isDef()) { - if (!isRewritableImplicitDef(MOP.getParent()->getOpcode())) + if (!isRewritableImplicitDef(MOP)) return false; return TRI->isSuperOrSubRegisterEq( MOP.getParent()->getOperand(0).getReg(), MOP.getReg()); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 098fc4528c91e..8c0dd4381fae8 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1385,9 +1385,8 @@ bool AArch64RegisterInfo::shouldCoalesce( MachineFunction &MF = *MI->getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); - // Coalescing of SUBREG_TO_REG is broken when using subreg liveness tracking, - // we must disable it for now. - if (MI->isSubregToReg() && MRI.subRegLivenessEnabled()) + if (MI->isSubregToReg() && MRI.subRegLivenessEnabled() && + !MF.getSubtarget<AArch64Subtarget>().enableSRLTSubregToRegMitigation()) return false; if (MI->isCopy() && diff --git a/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp b/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp new file mode 100644 index 0000000000000..40345769a64d9 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp @@ -0,0 +1,248 @@ +//===- AArch64SRLTDefineSuperRegs.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// When SubRegister Liveness Tracking (SRLT) is enabled, this pass adds +// extra implicit-def's to instructions that define the low N bits of +// a GPR/FPR register to also define the top bits, because all AArch64 +// instructions that write the low bits of a GPR/FPR also implicitly zero +// the top bits. For example, 'mov w0, w1' writes zeroes to the top 32-bits of +// x0, so this pass adds an `implicit-def $x0` after register allocation. +// +// These semantics are originally represented in the MIR using `SUBREG_TO_REG` +// which expresses that the top bits have been defined by the preceding +// instructions, but during register coalescing this information is lost and in +// contrast to when SRLT is disabled, when rewriting virtual -> physical +// registers the implicit-defs are not added to the instruction. +// +// There have been several attempts to fix this in the coalescer [1], but each +// iteration has exposed new bugs and the patch had to be reverted. +// Additionally, the concept of adding 'implicit-def' of a virtual register is +// particularly fragile and many places don't expect it (for example in +// `X86::commuteInstructionImpl` the code only looks at specific operands and +// does not consider implicit-defs. Similarly in `SplitEditor::addDeadDef` where +// it traverses operand 'defs' rather than 'all_defs'). +// +// We want a temporary solution that doesn't impact other targets and is simpler +// and less intrusive than the patch proposed for the register coalescer [1], so +// that we can enable SRLT for AArch64. +// +// The approach here is to just add the 'implicit-def' manually after rewriting +// virtual regs -> physical regs. 
This still means that during the register +// allocation process the dependences are not accurately represented in the MIR +// and LiveIntervals, but there are several reasons why we believe this isn't a +// problem in practice: +// (A) The register allocator only spills entire virtual registers. +// This is additionally guarded by code in +// AArch64InstrInfo::storeRegToStackSlot/loadRegFromStackSlot +// where it checks if a register matches the expected register class. +// (B) Rematerialization only happens when the instruction writes the full +// register. +// (C) The high bits of the AArch64 register cannot be written independently. +// (D) Instructions that write only part of a register always take that same +// register as a tied input operand, to indicate it's a merging operation. +// +// (A) means that for two virtual registers of regclass GPR32 and GPR64, if the +// GPR32 register is coalesced into the GPR64 vreg then the full GPR64 would +// be spilled/filled even if only the low 32-bits would be required for the +// given liverange. (B) means that the top bits of a GPR64 would never be +// overwritten by rematerialising a GPR32 sub-register for a given liverange. +// (C-D) means that we can assume that the MIR as input to the register +// allocator correctly expresses the instruction behaviour and dependences +// between values, so unless the register allocator would violate (A) or (B), +// the MIR is otherwise sound. +// +// Alternative approaches have also been considered, such as: +// (1) Changing the AArch64 instruction definitions to write all bits and +// extract the low N bits for the result. +// (2) Disabling coalescing of SUBREG_TO_REG and using regalloc hints to tell +// the register allocator to favour the same register for the input/output. +// (3) Adding a new coalescer guard node with a tied-operand constraint, such +// that when the SUBREG_TO_REG is removed, something still represents that +// the top bits are defined. 
The node would get removed before rewriting +// virtregs. +// (4) Using an explicit INSERT_SUBREG into a zero value and try to optimize +// away the INSERT_SUBREG (this is a more explicit variant of (2) and (3)) +// (5) Adding a new MachineOperand flag that represents the top bits would be +// defined, but are not read nor undef. +// +// (1) would be the best approach but would be a significant effort as it +// requires rewriting most/all instruction definitions and fixing MIR passes +// that rely on the current definitions, whereas (2-4) result in sub-optimal +// code that can't really be avoided because the explicit nodes would stop +// rematerialization. (5) might be a way to mitigate the +// fragility of implicit-def's of virtual registers if we want to pursue +// landing [1], but then we'd rather choose approach (1) to avoid using +// SUBREG_TO_REG entirely. +// +// [1] https://github.com/llvm/llvm-project/pull/168353 +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-srlt-define-superregs" +#define PASS_NAME "AArch64 SRLT Define Super-Regs Pass" + +namespace { + +struct AArch64SRLTDefineSuperRegs : public MachineFunctionPass { + inline static char ID = 0; + + AArch64SRLTDefineSuperRegs() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + Register getWidestSuperReg(Register R, const BitVector &RequiredBaseRegUnits, + const BitVector &QHiRegUnits); + + StringRef getPassName() const override { return 
PASS_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + MachineFunction *MF = nullptr; + const AArch64Subtarget *Subtarget = nullptr; + const AArch64RegisterInfo *TRI = nullptr; +}; + +} // end anonymous namespace + +INITIALIZE_PASS(AArch64SRLTDefineSuperRegs, DEBUG_TYPE, PASS_NAME, false, false) + +// Returns the widest super-reg for a given reg, or NoRegister if no suitable +// wider super-reg has been found. For example: +// W0 -> X0 +// B1 -> Q1 (without SVE) +// -> Z1 (with SVE) +// W1_W2 -> X1_X2 +// D0_D1 -> Q0_Q1 (without SVE) +// -> Z0_Z1 (with SVE) +Register AArch64SRLTDefineSuperRegs::getWidestSuperReg( + Register R, const BitVector &RequiredBaseRegUnits, + const BitVector &QHiRegUnits) { + assert(R.isPhysical() && + "Expected to be run straight after virtregrewriter!"); + + BitVector Units(TRI->getNumRegUnits()); + for (MCRegUnit U : TRI->regunits(R)) + Units.set((unsigned)U); + + auto IsSuitableSuperReg = [&](Register SR) { + for (MCRegUnit U : TRI->regunits(SR)) { + // Avoid choosing z1 as super-reg of d1 if SVE is not available. + // Q*_HI registers are only set for SVE registers, as those consist + // of the Q* register for the low 128 bits and the Q*_HI (artificial) + // register for the top (vscale-1) * 128 bits. + if (QHiRegUnits.test((unsigned)U) && + !Subtarget->isSVEorStreamingSVEAvailable()) + return false; + // We consider a super-reg as unsuitable if any of its reg units is not + // artificial and not shared, as that would imply that U is a unit for a + // different register, which means the candidate super-reg is likely + // a register tuple. 
+ if (!TRI->isArtificialRegUnit(U) && + (!Units.test((unsigned)U) || !RequiredBaseRegUnits.test((unsigned)U))) + return false; + } + return true; + }; + + Register LargestSuperReg = AArch64::NoRegister; + for (Register SR : TRI->superregs(R)) + if (IsSuitableSuperReg(SR) && (LargestSuperReg == AArch64::NoRegister || + TRI->isSuperRegister(LargestSuperReg, SR))) + LargestSuperReg = SR; + + return LargestSuperReg; +} + +bool AArch64SRLTDefineSuperRegs::runOnMachineFunction(MachineFunction &MF) { + this->MF = &MF; + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); + TRI = Subtarget->getRegisterInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + + if (!MRI->subRegLivenessEnabled()) + return false; + + assert(!MRI->isSSA() && "Expected to be run after breaking down SSA form!"); + + auto XRegs = seq_inclusive<unsigned>(AArch64::X0, AArch64::X28); + auto ZRegs = seq_inclusive<unsigned>(AArch64::Z0, AArch64::Z31); + constexpr unsigned FixedRegs[] = {AArch64::FP, AArch64::LR, AArch64::SP}; + + BitVector RequiredBaseRegUnits(TRI->getNumRegUnits()); + for (Register R : concat<unsigned>(XRegs, ZRegs, FixedRegs)) + for (MCRegUnit U : TRI->regunits(R)) + RequiredBaseRegUnits.set((unsigned)U); + + BitVector QHiRegUnits(TRI->getNumRegUnits()); + for (Register R : seq_inclusive<unsigned>(AArch64::Q0_HI, AArch64::Q31_HI)) + for (MCRegUnit U : TRI->regunits(R)) + QHiRegUnits.set((unsigned)U); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // PATCHPOINT may have a 'def' that's not a register, avoid this. + if (MI.getOpcode() == TargetOpcode::PATCHPOINT) + continue; + // For each partial register write, also add an implicit-def for top bits + // of the register (e.g. for w0 add a def of x0). 
+ SmallSet<Register, 8> SuperRegs; + for (const MachineOperand &DefOp : MI.defs()) + if (Register R = getWidestSuperReg(DefOp.getReg(), RequiredBaseRegUnits, + QHiRegUnits); + R != AArch64::NoRegister) + SuperRegs.insert(R); + + if (!SuperRegs.size()) + continue; + + LLVM_DEBUG(dbgs() << "Adding implicit-defs to: " << MI); + for (Register R : SuperRegs) { + LLVM_DEBUG(dbgs() << " " << printReg(R, TRI) << "\n"); + bool IsRenamable = any_of(MI.defs(), [&](const MachineOperand &MO) { + return MO.isRenamable() && TRI->regsOverlap(MO.getReg(), R); + }); + bool IsDead = any_of(MI.defs(), [&](const MachineOperand &MO) { + return MO.isDead() && TRI->regsOverlap(MO.getReg(), R); + }); + MachineOperand DefOp = MachineOperand::CreateReg( + R, /*isDef=*/true, /*isImp=*/true, /*isKill=*/false, + /*isDead=*/IsDead, /*isUndef=*/false, /*isEarlyClobber=*/false, + /*SubReg=*/0, /*isDebug=*/false, /*isInternalRead=*/false, + /*isRenamable=*/IsRenamable); + MI.addOperand(DefOp); + } + Changed = true; + } + } + + return Changed; +} + +FunctionPass *llvm::createAArch64SRLTDefineSuperRegsPass() { + return new AArch64SRLTDefineSuperRegs(); +} diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 92a7412e83fac..a642841243be3 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -355,7 +355,8 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, unsigned MinSVEVectorSizeInBitsOverride, unsigned MaxSVEVectorSizeInBitsOverride, bool IsStreaming, bool IsStreamingCompatible, - bool HasMinSize) + bool HasMinSize, + bool EnableSRLTSubregToRegMitigation) : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS), ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()), ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()), @@ -367,7 +368,9 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, ? 
std::optional<unsigned>(AArch64StreamingHazardSize) : std::nullopt), MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride), - MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT), + MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), + EnableSRLTSubregToRegMitigation(EnableSRLTSubregToRegMitigation), + TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)), TLInfo(TM, *this) { if (AArch64::isX18ReservedByDefault(TT)) @@ -400,7 +403,17 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP")) ReserveXRegisterForRA.set(29); - EnableSubregLiveness = EnableSubregLivenessTracking.getValue(); + // To benefit from SME2's strided-register multi-vector load/store + // instructions we'll need to enable subreg liveness. Our longer + // term aim is to make this the default, regardless of streaming + // mode, but there are still some outstanding issues, see: + // https://github.com/llvm/llvm-project/pull/174188 + // and: + // https://github.com/llvm/llvm-project/pull/168353 + if (IsStreaming) + EnableSubregLiveness = true; + else + EnableSubregLiveness = EnableSubregLivenessTracking.getValue(); } const CallLowering *AArch64Subtarget::getCallLowering() const { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index bd8a2d5234f2d..248e140b3101c 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -88,6 +88,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { std::optional<unsigned> StreamingHazardSize; unsigned MinSVEVectorSizeInBits; unsigned MaxSVEVectorSizeInBits; + bool EnableSRLTSubregToRegMitigation; unsigned VScaleForTuning = 1; TailFoldingOpts DefaultSVETFOpts = TailFoldingOpts::Disabled; @@ -128,7 +129,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { unsigned MinSVEVectorSizeInBitsOverride = 
0, unsigned MaxSVEVectorSizeInBitsOverride = 0, bool IsStreaming = false, bool IsStreamingCompatible = false, - bool HasMinSize = false); + bool HasMinSize = false, + bool EnableSRLTSubregToRegMitigation = false); // Getters for SubtargetFeatures defined in tablegen #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ @@ -467,6 +469,10 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { /// add + cnt instructions. bool useScalarIncVL() const; + bool enableSRLTSubregToRegMitigation() const { + return EnableSRLTSubregToRegMitigation; + } + /// Choose a method of checking LR before performing a tail call. AArch64PAuth::AuthCheckMethod getAuthenticatedLRCheckMethod(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 1ec5a20cc0ce0..3aba866458830 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -227,6 +227,12 @@ static cl::opt<bool> cl::desc("Enable new lowering for the SME ABI"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableSRLTSubregToRegMitigation( + "aarch64-srlt-mitigate-sr2r", + cl::desc("Enable SUBREG_TO_REG mitigation by adding 'implicit-def' for " + "super-regs when using Subreg Liveness Tracking"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. 
@@ -268,6 +274,7 @@ LLVMInitializeAArch64Target() { initializeKCFIPass(PR); initializeSMEABIPass(PR); initializeMachineSMEABIPass(PR); + initializeAArch64SRLTDefineSuperRegsPass(PR); initializeSMEPeepholeOptPass(PR); initializeSVEIntrinsicOptsPass(PR); initializeAArch64SpeculationHardeningPass(PR); @@ -462,7 +469,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { resetTargetOptions(F); I = std::make_unique<AArch64Subtarget>( TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize, - MaxSVEVectorSize, IsStreaming, Is... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/176197 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
