llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlgo
Author: Wei Xiao (williamweixiao)
<details>
<summary>Changes</summary>
Replace getCSRCost() and the global regalloc-csr-cost-scale cl::opt with
two target-overridable methods: getCSRFirstUseCost(MF) and
getCSRCostScale(MF). This makes CSR cost configuration target-specific
and simplifies initializeCSRCost() to a single formula:
CSRCost = BaseCost * EntryFreq * Scale / 100
Target changes (new cost model):
- AArch64: getCSRFirstUseCost()=2 (save+restore), getCSRCostScale()=80.
Previously, AArch64 used the legacy getCSRCost()=5 path, which produced an
effectively negligible CSRCost=5. The new model produces a meaningful
cost that properly weighs CSR usage against spilling.
- RISCV: getCSRFirstUseCost()=2 (save+restore), getCSRCostScale()=80.
Same change as AArch64.
- AMDGPU: getCSRFirstUseCost()=100. Previously used the legacy
getCSRCost()=100 path which produced CSRCost=100. The new model
produces CSRCost=100*EntryFreq, which more aggressively discourages
CSR usage as intended ("stack access is very expensive").
Newly enabled:
- X86: getCSRFirstUseCost()=2 (push+pop), or 0 when PPX is available.
X86 SPEC CPU2017 rate performance (Intel Xeon Platinum 8280).
Exp = with this patch, Ref = without this patch (baseline).
A higher rate score is better. Changes within 2% on an individual benchmark
are considered run-to-run noise:
```
Benchmark Exp(rat) Ref(rat) Change
500.perlbench_r 205.40 204.82 +0.29%
502.gcc_r 232.12 231.99 +0.06%
505.mcf_r 148.90 147.42 +1.00%
508.namd_r 207.94 210.54 -1.23%
510.parest_r 123.19 123.38 -0.15%
511.povray_r 287.00 272.70 +5.24%
520.omnetpp_r 159.85 160.85 -0.62%
523.xalancbmk_r 210.55 208.41 +1.02%
525.x264_r 514.22 518.01 -0.73%
526.blender_r 247.95 251.25 -1.31%
531.deepsjeng_r 221.74 220.70 +0.47%
538.imagick_r 376.90 366.98 +2.70%
541.leela_r 227.98 224.80 +1.42%
544.nab_r 418.96 415.92 +0.73%
557.xz_r 167.58 152.60 +9.81%
Geometric mean: +1.21%
```
---
Patch is 12.41 MiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/202007.diff
152 Files Affected:
- (modified) clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c (+1-3)
- (modified) clang/test/Frontend/stack-layout-remark.c (+3-3)
- (modified) llvm/include/llvm/CodeGen/TargetRegisterInfo.h (+14-4)
- (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+15-41)
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.h (+5-8)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+3-1)
- (modified) llvm/lib/Target/RISCV/RISCVRegisterInfo.h (+6-5)
- (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+9)
- (modified) llvm/lib/Target/X86/X86RegisterInfo.h (+2)
- (modified) llvm/test/CodeGen/AArch64/cgp-usubo.ll (+11-10)
- (modified) llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll (+17-20)
- (modified) llvm/test/CodeGen/AArch64/pr51516.mir (+4-4)
- (modified) llvm/test/CodeGen/AArch64/ragreedy-csr2.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
(+12-10)
- (modified) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (+21-14)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll (+17-15)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
(+19-15)
- (modified) llvm/test/CodeGen/AArch64/spill-reload-remarks.ll (+100-1)
- (modified) llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll (+12-11)
- (modified) llvm/test/CodeGen/AArch64/sve-breakdown-scalable-vectortype.ll
(+252-315)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+54056-53059)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+81-84)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+392-381)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+6769-6769)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+189-192)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+1486-1248)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+2683-2141)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+3459-2838)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+4676-4194)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+6841-6526)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+8160-7469)
- (modified) llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir (+16-22)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll (+78-82)
- (modified) llvm/test/CodeGen/AMDGPU/function-resource-usage.ll (+1028-461)
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
(+101-143)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+148-151)
- (modified) llvm/test/CodeGen/AMDGPU/issue176578.ll (+58-60)
- (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
(+110-137)
- (modified) llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir
(+14-17)
- (modified) llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask-phi-extend.ll
(+431-418)
- (modified) llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
(+272-302)
- (modified) llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
(+216-218)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll (+7-8)
- (modified) llvm/test/CodeGen/MLRegAlloc/interactive-mode.ll (+3-6)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll (+21-23)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll (+25-22)
- (modified) llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll (+4-4)
- (modified) llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll (+4-4)
- (modified) llvm/test/CodeGen/RISCV/bfloat-convert.ll (+31-28)
- (modified) llvm/test/CodeGen/RISCV/bitint-fp-conv-200.ll (+396-401)
- (modified) llvm/test/CodeGen/RISCV/condops.ll (+32-35)
- (modified) llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll (+26-25)
- (modified) llvm/test/CodeGen/RISCV/double-convert.ll (+39-38)
- (modified) llvm/test/CodeGen/RISCV/double-round-conv-sat.ll (+66-60)
- (modified) llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll (+4-8)
- (modified) llvm/test/CodeGen/RISCV/exception-pointer-register.ll (+18-30)
- (modified) llvm/test/CodeGen/RISCV/float-convert.ll (+34-33)
- (modified) llvm/test/CodeGen/RISCV/float-round-conv-sat.ll (+174-162)
- (modified) llvm/test/CodeGen/RISCV/fp128.ll (+114-114)
- (modified) llvm/test/CodeGen/RISCV/fpclamptosat.ll (+1416-1354)
- (modified) llvm/test/CodeGen/RISCV/fpenv.ll (+14-10)
- (modified) llvm/test/CodeGen/RISCV/half-convert.ll (+151-140)
- (modified) llvm/test/CodeGen/RISCV/half-round-conv-sat.ll (+240-216)
- (modified)
llvm/test/CodeGen/RISCV/machine-outliner-and-machine-copy-propagation.ll (+7-3)
- (modified) llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll (+62-63)
- (modified) llvm/test/CodeGen/RISCV/overflow-intrinsics.ll (+43-65)
- (modified) llvm/test/CodeGen/RISCV/rv32xtheadbb.ll (+26-25)
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+26-25)
- (modified) llvm/test/CodeGen/RISCV/rv64-double-convert.ll (+47-46)
- (modified) llvm/test/CodeGen/RISCV/rv64-float-convert.ll (+47-46)
- (modified) llvm/test/CodeGen/RISCV/rv64-half-convert.ll (+47-46)
- (modified) llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll (+12-12)
- (modified) llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll (+585-585)
- (modified) llvm/test/CodeGen/RISCV/rvv/pr95865.ll (+33-33)
- (modified) llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll (+28-32)
- (modified) llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll (+33-32)
- (modified) llvm/test/CodeGen/RISCV/select-cc.ll (+24-27)
- (modified) llvm/test/CodeGen/RISCV/shrinkwrap.ll (+61-52)
- (modified) llvm/test/CodeGen/RISCV/simplify-condbr.ll (+4-6)
- (modified) llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll (+20-17)
- (modified) llvm/test/CodeGen/X86/2007-02-16-BranchFold.ll (+11-10)
- (modified) llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll (+26-31)
- (modified) llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll (+18-18)
- (modified) llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll (+39-31)
- (modified) llvm/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll (+4-1)
- (modified) llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll (+73-75)
- (modified) llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll (+36-31)
- (modified) llvm/test/CodeGen/X86/PR40322.ll (+13-15)
- (modified) llvm/test/CodeGen/X86/andnot-patterns.ll (+24-22)
- (modified) llvm/test/CodeGen/X86/andnot-sink-not.ll (+280-261)
- (modified) llvm/test/CodeGen/X86/apx/add.ll (+43-43)
- (modified) llvm/test/CodeGen/X86/apx/memfold-no-physreg.ll (+65-89)
- (modified) llvm/test/CodeGen/X86/apx/memfold-origVNI-crash.ll (+24-31)
- (modified) llvm/test/CodeGen/X86/apx/pr191368.ll (+40-41)
- (modified) llvm/test/CodeGen/X86/apx/push2-pop2.ll (+24-24)
- (modified) llvm/test/CodeGen/X86/atom-fixup-lea2.ll (+49-5)
- (modified) llvm/test/CodeGen/X86/atomic-bit-test.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/atomic-rm-bit-test.ll (+169-150)
- (modified) llvm/test/CodeGen/X86/avoid-sfb.ll (+62-110)
- (modified) llvm/test/CodeGen/X86/block-placement.ll (+1719-227)
- (modified) llvm/test/CodeGen/X86/bmi.ll (+146-120)
- (modified) llvm/test/CodeGen/X86/bsf.ll (+46-44)
- (modified) llvm/test/CodeGen/X86/bt-merge-fuse.ll (+31-25)
- (modified) llvm/test/CodeGen/X86/btc_bts_btr.ll (+19-19)
- (modified) llvm/test/CodeGen/X86/bypass-slow-division-32.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll (+5-5)
- (modified) llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll (+3-2)
- (modified) llvm/test/CodeGen/X86/cgp-usubo.ll (+21-24)
- (modified) llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll (+75-59)
- (modified) llvm/test/CodeGen/X86/csr-split.ll (+33-37)
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
(+233-235)
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
(+201-199)
- (modified) llvm/test/CodeGen/X86/extract-bits.ll (+565-478)
- (modified) llvm/test/CodeGen/X86/extract-lowbits.ll (+68-68)
- (modified) llvm/test/CodeGen/X86/fp128-cast.ll (+11-11)
- (modified) llvm/test/CodeGen/X86/fptosi-sat-scalar.ll (+246-234)
- (modified) llvm/test/CodeGen/X86/fptoui-sat-scalar.ll (+171-163)
- (modified) llvm/test/CodeGen/X86/fshl.ll (+44-42)
- (modified) llvm/test/CodeGen/X86/funnel-shift.ll (+18-18)
- (modified) llvm/test/CodeGen/X86/i128-udiv.ll (+178-171)
- (modified) llvm/test/CodeGen/X86/indirect-branch-tracking-eh.ll (+161-61)
- (modified)
llvm/test/CodeGen/X86/inline-spiller-impdef-on-implicit-def-regression.ll
(+38-43)
- (modified) llvm/test/CodeGen/X86/lrshrink.ll (+7-6)
- (modified) llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll (+6-2192)
- (modified) llvm/test/CodeGen/X86/memcmp-optsize-x32.ll (+4-400)
- (modified) llvm/test/CodeGen/X86/memcmp-pgso-x32.ll (+4-400)
- (modified) llvm/test/CodeGen/X86/memcmp-x32.ll (+6-1678)
- (modified) llvm/test/CodeGen/X86/midpoint-int.ll (+90-80)
- (modified) llvm/test/CodeGen/X86/mul-constant-result.ll (+19-19)
- (modified) llvm/test/CodeGen/X86/no-split-size.ll (+29-38)
- (modified) llvm/test/CodeGen/X86/optimize-max-0.ll (+171-178)
- (modified) llvm/test/CodeGen/X86/peep-test-4.ll (+14-14)
- (modified) llvm/test/CodeGen/X86/probe-stack-eflags.ll (+11-12)
- (modified) llvm/test/CodeGen/X86/ragreedy-bug.ll (+233-26)
- (modified) llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll (+36-36)
- (modified) llvm/test/CodeGen/X86/sjlj-eh.ll (+187-60)
- (modified) llvm/test/CodeGen/X86/speculative-load-hardening.ll (+71-80)
- (modified) llvm/test/CodeGen/X86/split-reg-with-hint.ll (+17-38)
- (modified) llvm/test/CodeGen/X86/statepoint-invoke.ll (+30-33)
- (modified) llvm/test/CodeGen/X86/statepoint-vreg-details.ll (+10-187)
- (modified) llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll (+22-19)
- (modified) llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll (+77-75)
- (modified) llvm/test/CodeGen/X86/tail-opts.ll (+33-30)
- (modified) llvm/test/CodeGen/X86/tailcall-cgp-dup.ll (+13-11)
- (modified) llvm/test/CodeGen/X86/tbm_patterns.ll (+54-42)
- (modified) llvm/test/CodeGen/X86/x86-shrink-wrapping.ll (+12-26)
- (modified) llvm/test/CodeGen/X86/xmulo.ll (+6-8)
- (modified) llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll (+13-12)
- (modified) llvm/test/DebugInfo/KeyInstructions/X86/dwarf-ranks-blocks.ll
(+15-14)
- (modified) llvm/test/DebugInfo/RISCV/dw_op_entry_value_32bit.ll (+1-1)
- (modified) llvm/test/DebugInfo/RISCV/dw_op_entry_value_64bit.ll (+1-1)
- (modified) llvm/test/tools/llvm-locstats/locstats.ll (+5-5)
``````````diff
diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c
b/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c
index ff3a1a47288a6..49c01d80dc85c 100644
--- a/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c
+++ b/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c
@@ -95,10 +95,8 @@ void hp5_phi_ptr_mixed(int x) NO_TAIL {
// CHECK: test ecx, ecx
// CHECK: mov rsi, qword ptr [rip + __ref_g_has_pointers]
// CHECK: call do_side_effects
-// CHECK: jmp
// CHECK: call do_other_side_effects
-// CHECK: lea rsi, [rip + g_this_is_const]
-// CHECK: mov rcx, rsi
+// CHECK: lea rcx, [rip + g_this_is_const]
// CHECK: call take_data
// CHECK: .seh_endproc
diff --git a/clang/test/Frontend/stack-layout-remark.c
b/clang/test/Frontend/stack-layout-remark.c
index b0ed03c80f24a..94e1f8a827777 100644
--- a/clang/test/Frontend/stack-layout-remark.c
+++ b/clang/test/Frontend/stack-layout-remark.c
@@ -152,9 +152,9 @@ extern void use_dot_vector(struct Array *data);
// O3-DEBUG: Function: do_work
// O3-DEBUG-NEXT: Offset: [SP-8], Type: Spill, Align: 16, Size: 8
// O3-DEBUG-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-// O3-DEBUG-NEXT: Offset: [SP-24], Type: Spill, Align: 16, Size: 8
-// O3-DEBUG-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
-// O3-DEBUG-NEXT: Offset: [SP-40], Type: Spill, Align: 16, Size: 8
+// O3-DEBUG: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+// O3-DEBUG: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+// O3-DEBUG: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
int do_work(struct Array *A, struct Array *B, struct Result *out) {
if (!A || !B)
return -1;
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 7c3c56552b82c..4db5c50c8bdae 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1032,10 +1032,20 @@ class LLVM_ABI TargetRegisterInfo : public
MCRegisterInfo {
/// Allow the target to override the cost of using a callee-saved register
for
/// the first time. Default value of 0 means we will use a callee-saved
- /// register if it is available.
- virtual unsigned getCSRFirstUseCost() const { return 0; }
- /// FIXME: We should deprecate this usage.
- virtual unsigned getCSRCost() const { return 0; }
+ /// register if it is available. The returned value is multiplied by the
entry
+ /// block frequency to produce the final CSR cost used by the greedy register
+ /// allocator. For example, a cost of 2 represents the cost of a push/pop
+ /// pair (2 memory accesses at entry frequency).
+ virtual unsigned getCSRFirstUseCost(const MachineFunction &MF) const {
+ return 0;
+ }
+
+ /// Allow the target to override the scale applied to the CSR first-use cost.
+ /// The scale is a percentage (e.g., 80 means 80% of the base cost).
+ /// Default value of 100 means no scaling.
+ virtual unsigned getCSRCostScale(const MachineFunction &MF) const {
+ return 100;
+ }
/// Returns true if the target requires (and can make use of) the register
/// scavenger.
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp
b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 6b8a9b8190f9a..634ba582e2f93 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -115,11 +115,6 @@ CSRFirstTimeCost("regalloc-csr-first-time-cost",
cl::desc("Cost for first time use of callee-saved register."),
cl::init(0), cl::Hidden);
-static cl::opt<unsigned> CSRCostScale(
- "regalloc-csr-cost-scale",
- cl::desc("Scale for the callee-saved register cost, in percentage."),
- cl::init(80), cl::Hidden);
-
static cl::opt<unsigned long> GrowRegionComplexityBudget(
"grow-region-complexity-budget",
cl::desc("growRegion() does not scale with the number of BB edges, so "
@@ -2418,43 +2413,22 @@ void RAGreedy::aboutToRemoveInterval(const LiveInterval
&LI) {
}
void RAGreedy::initializeCSRCost() {
- if (!CSRCostScale.getNumOccurrences() &&
- (CSRFirstTimeCost.getNumOccurrences() || TRI->getCSRCost())) {
- // We should deprecate the usage of CSRFirstTimeCost!
- // We use the command-line option if it is explicitly set, otherwise use
the
- // larger one out of the command-line option and the value reported by TRI.
- CSRCost = BlockFrequency(
- CSRFirstTimeCost.getNumOccurrences()
- ? CSRFirstTimeCost
- : std::max((unsigned)CSRFirstTimeCost, TRI->getCSRCost()));
- if (!CSRCost.getFrequency())
- return;
-
- // Raw cost is relative to Entry == 2^14; scale it appropriately.
- uint64_t ActualEntry = MBFI->getEntryFreq().getFrequency();
- if (!ActualEntry) {
- CSRCost = BlockFrequency(0);
- return;
- }
- uint64_t FixedEntry = 1 << 14;
- if (ActualEntry < FixedEntry) {
- CSRCost *= BranchProbability(ActualEntry, FixedEntry);
- } else if (ActualEntry <= UINT32_MAX) {
- // Invert the fraction and divide.
- CSRCost /= BranchProbability(FixedEntry, ActualEntry);
- } else {
- // Can't use BranchProbability in general, since it takes 32-bit numbers.
- CSRCost =
- BlockFrequency(CSRCost.getFrequency() * (ActualEntry / FixedEntry));
- }
- } else {
- uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
- CSRCost = BlockFrequency(TRI->getCSRFirstUseCost() * EntryFreq);
- if (CSRCostScale < 100)
- CSRCost *= BranchProbability(CSRCostScale, 100);
- else
- CSRCost /= BranchProbability(100, CSRCostScale);
+ unsigned BaseCost = CSRFirstTimeCost.getNumOccurrences()
+ ? CSRFirstTimeCost
+ : TRI->getCSRFirstUseCost(*MF);
+ if (!BaseCost) {
+ CSRCost = BlockFrequency(0);
+ return;
}
+
+ uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
+ CSRCost = BlockFrequency(BaseCost * EntryFreq);
+
+ unsigned Scale = TRI->getCSRCostScale(*MF);
+ if (Scale < 100)
+ CSRCost *= BranchProbability(Scale, 100);
+ else if (Scale > 100)
+ CSRCost /= BranchProbability(100, Scale);
}
/// Collect the hint info for \p Reg.
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index ac58d8d6b1cc7..b307df9cc932b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -53,16 +53,13 @@ class AArch64RegisterInfo final : public
AArch64GenRegisterInfo {
const uint32_t *getDarwinCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const;
- unsigned getCSRCost() const override {
- // The cost will be compared against BlockFrequency where entry has the
- // value of 1 << 14. A value of 5 will choose to spill or split really
- // cold path instead of using a callee-saved register.
- return 5;
- }
- unsigned getCSRFirstUseCost() const override {
- // The cost of 2 means push and pop for each CSR.
+ unsigned getCSRFirstUseCost(const MachineFunction &MF) const override {
+ // The cost of save and restore (e.g. STP/LDP) for each CSR.
return 2;
}
+ unsigned getCSRCostScale(const MachineFunction &MF) const override {
+ return 80;
+ }
const TargetRegisterClass *
getSubClassWithSubReg(const TargetRegisterClass *RC,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5e08e47ad4d83..239a8676c75ea 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -109,7 +109,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
// Stack access is very expensive. CSRs are also the high registers, and we
// want to minimize the number of used registers.
- unsigned getCSRCost() const override { return 100; }
+ unsigned getCSRFirstUseCost(const MachineFunction &MF) const override {
+ return 100;
+ }
// When building a block VGPR load, we only really transfer a subset of the
// registers in the block, based on a mask. Liveness analysis is not aware of
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3a77820d28bbd..4fabdb9d2ce3c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -68,11 +68,12 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
- unsigned getCSRCost() const override {
- // The cost will be compared against BlockFrequency where entry has the
- // value of 1 << 14. A value of 5 will choose to spill or split cold
- // path instead of using a callee-saved register.
- return 5;
+ unsigned getCSRFirstUseCost(const MachineFunction &MF) const override {
+ // The cost of save and restore (e.g. sd/ld) for each CSR.
+ return 2;
+ }
+ unsigned getCSRCostScale(const MachineFunction &MF) const override {
+ return 80;
}
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const
override;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp
b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index c84e0f441a459..5101452a6f78a 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1281,3 +1281,12 @@ bool X86RegisterInfo::isNonRex2RegClass(const
TargetRegisterClass *RC) const {
return true;
}
}
+
+unsigned X86RegisterInfo::getCSRFirstUseCost(const MachineFunction &MF) const {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ if (ST.is64Bit() && ST.hasPPX())
+ return 0;
+
+ // push + pop.
+ return 2;
+}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h
b/llvm/lib/Target/X86/X86RegisterInfo.h
index e646591663aca..1418e2892768a 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -182,6 +182,8 @@ class X86RegisterInfo final : public X86GenRegisterInfo {
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
return true;
}
+
+ unsigned getCSRFirstUseCost(const MachineFunction &MF) const override;
};
} // End llvm namespace
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index f990920e2793a..35879b94e503b 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -280,29 +280,30 @@ end:
define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond)
nounwind {
; CHECK-LABEL: usubo_ult_cmp_dominates_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w19, w3
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: tbz w3, #0, .LBB15_3
; CHECK-NEXT: // %bb.1: // %t
; CHECK-NEXT: cmp x0, x1
-; CHECK-NEXT: mov x22, x0
-; CHECK-NEXT: mov x20, x2
-; CHECK-NEXT: cset w21, lo
-; CHECK-NEXT: mov x23, x1
-; CHECK-NEXT: mov w0, w21
+; CHECK-NEXT: mov x21, x0
+; CHECK-NEXT: str x2, [sp, #8] // 8-byte Spill
+; CHECK-NEXT: cset w20, lo
+; CHECK-NEXT: mov x22, x1
+; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl call
-; CHECK-NEXT: subs x8, x22, x23
+; CHECK-NEXT: subs x8, x21, x22
; CHECK-NEXT: b.hs .LBB15_3
; CHECK-NEXT: // %bb.2: // %end
-; CHECK-NEXT: mov w19, w21
-; CHECK-NEXT: str x8, [x20]
+; CHECK-NEXT: ldr x9, [sp, #8] // 8-byte Reload
+; CHECK-NEXT: mov w19, w20
+; CHECK-NEXT: str x8, [x9]
; CHECK-NEXT: .LBB15_3: // %common.ret
; CHECK-NEXT: and w0, w19, #0x1
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
br i1 %cond, label %t, label %f
diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
index 09e80ee936738..3bb09723381fa 100644
--- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -582,26 +582,30 @@ declare void @do_something() #1
define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: adrp x19, :got:a
-; CHECK-NEXT: ldr x19, [x19, :got_lo12:a]
-; CHECK-NEXT: ldr w8, [x19]
+; CHECK-NEXT: adrp x8, :got:a
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:a]
+; CHECK-NEXT: ldr w8, [x8]
; CHECK-NEXT: cmn w8, #2
; CHECK-NEXT: b.gt .LBB10_4
; CHECK-NEXT: // %bb.1: // %while.body.preheader
-; CHECK-NEXT: sub w20, w8, #1
+; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: sub w19, w8, #1
; CHECK-NEXT: .LBB10_2: // %while.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: bl do_something
-; CHECK-NEXT: adds w20, w20, #1
+; CHECK-NEXT: adds w19, w19, #1
; CHECK-NEXT: b.mi .LBB10_2
; CHECK-NEXT: // %bb.3: // %while.cond.while.end_crit_edge
-; CHECK-NEXT: ldr w8, [x19]
+; CHECK-NEXT: adrp x8, :got:a
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:a]
+; CHECK-NEXT: ldr w8, [x8]
+; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .LBB10_4: // %while.end
; CHECK-NEXT: cmp w8, #1
; CHECK-NEXT: b.gt .LBB10_7
@@ -616,16 +620,9 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ()
#0 {
; CHECK-NEXT: b.ne .LBB10_7
; CHECK-NEXT: // %bb.6:
; CHECK-NEXT: mov w0, #123 // =0x7b
-; CHECK-NEXT: b .LBB10_8
+; CHECK-NEXT: ret
; CHECK-NEXT: .LBB10_7: // %if.end
; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: .LBB10_8: // %return
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w19
-; CHECK-NEXT: .cfi_restore w20
-; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: ret
entry:
%0 = load i32, ptr @a, align 4
diff --git a/llvm/test/CodeGen/AArch64/pr51516.mir
b/llvm/test/CodeGen/AArch64/pr51516.mir
index ae54ad0d5cef4..854de23f3d426 100644
--- a/llvm/test/CodeGen/AArch64/pr51516.mir
+++ b/llvm/test/CodeGen/AArch64/pr51516.mir
@@ -5,10 +5,10 @@
# of ADDXri is killed by the STRXui in this block.
# CHECK-LABEL: name: test
-# CHECK: bb.17:
-# CHECK: STRXui
-# CHECK: LDRXui
-# CHECK: bb.18:
+# CHECK: bb.9:
+# CHECK: ADDXri
+# CHECK: STRXui %{{[0-9]+}}, %stack.1
+# CHECK: bb.10:
---
name: test
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll
b/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll
index 2d5f5dbbf8f07..2ef7d1bacd288 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 6
-; RUN: llc < %s -regalloc-csr-cost-scale=80 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target triple = "aarch64"
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index b9a542b330c0f..03ebba5a3e308 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -42,18 +42,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; NOPAIR-NEXT: addvl sp, sp, #-1
; NOPAIR-NEXT: str z0, [sp] // 16-byte Folded Spill
-; NOPAIR-NEXT: mrs x19, SVCR
-; NOPAIR-NEXT: tbz w19, #0, .LBB0_2
+; NOPAIR-NEXT: mrs x8, SVCR
+; NOPAIR-NEXT: tbz w8, #0, .LBB0_2
; NOPAIR-NEXT: // %bb.1:
; NOPAIR-NEXT: smstop sm
; NOPAIR-NEXT: .LBB0_2:
-; NOPAIR-NEXT: rdvl x8, #1
-; NOPAIR-NEXT: addsvl x8, x8, #-1
-; NOPAIR-NEXT: cbz x8, .LBB0_4
+; NOPAIR-NEXT: rdvl x9, #1
+; NOPAIR-NEXT: addsvl x9, x9, #-1
+; NOPAIR-NEXT: cbz x9, .LBB0_4
; NOPAIR-NEXT: // %bb.3:
; NOPAIR-NEXT: brk #0x1
; NOPAIR-NEXT: .LBB0_4:
; NOPAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; NOPAIR-NEXT: mov x19, x8
; NOPAIR-NEXT: bl my_func2
; NOPAIR-NEXT: tbz w19, #0, .LBB0_6
; NOPAIR-NEXT: // %bb.5:
@@ -128,18 +129,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; PAIR-NEXT: addvl sp, sp, #-1
; PAIR-NEXT: str z0, [sp] // 16-byte Folded Spill
-; PAIR-NEXT: mrs x19, SVCR
-; PAIR-NEXT: tbz w19, #0, .LBB0_2
+; PAIR-NEXT: mrs x8, SVCR
+; PAIR-NEXT: tbz w8, #0, .LBB0_2
; PAIR-NEXT: // %bb.1:
; PAIR-NEXT: smstop sm
; PAIR-NEXT: .LBB0_2:
-; PAIR-NEXT: rdvl x8, #1
-; PAIR-NEXT: addsvl x8, x8, #-1
-; PAIR-NEXT: cbz x8, .LBB0_4
+; PAIR-NEXT: rdvl x9, #1
+; PAIR-NEXT: addsvl x9, x9, #-1
+; PAIR-NEXT: cbz x9, .LBB0_4
; PAIR-NEXT: // %bb.3:
; PAIR-NEXT: brk #0x1
; PAIR-NEXT: .LBB0_4:
; PAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; PAIR-NEXT: mov x19, x8
; PAIR-NEXT: bl my_func2
; PAIR-NEXT: tbz w19, #0, .LBB0_6
; PAIR-NEXT: // %bb.5:
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 36539d94338a0..850aa7a63e016 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -514,24 +514,27 @@ define void @test12() "aarch64_pstate_sm_body" {
define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test13:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: addsvl x8, x8, #-1
; CHECK-NEXT: cbnz x8, .LBB14_2
; CHECK-NEXT: // %bb.1:
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: str x0, [sp, #8] // 8-byte Spill
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: bl callee_farg_fret
-; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
@@ -540,19 +543,23 @@ define void @test13(ptr %ptr) nounwind
"aarch64_pstate_sm_enabled" {
; CHECK-NEXT: .LBB14_2:
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB14_3:
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: bl callee_farg_fret
-; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-by...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/202007
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits