https://gcc.gnu.org/g:5cb0842b8b86f8495a110018fb8af3c4c3316605
commit r17-1143-g5cb0842b8b86f8495a110018fb8af3c4c3316605 Author: Kyrylo Tkachov <[email protected]> Date: Wed May 27 13:47:48 2026 -0700 PR target/122827: aarch64: Treat SVE modes as full-Z for callee-save accounting PR122827 is a miscompile of 526.blender_r in SPEC2017 with -Ofast -flto=auto -mcpu=neoverse-v2 -msve-vector-bits=128 --param aarch64-autovec-preference=sve-only. Bisection landed on g:b191e8bdecf (the IRA cost-model rework for PR117477), and the miscompile disappears with -mearly-ra=none, -fno-caller-saves, or -fno-early-ra, all of which steer register allocation away from V8-V15. The bug is in shade_material_loop in shadeinput.c, where a broadcast of a float scalar to a VNx2SF value is held in V14 across a call: mov z14.s, s15 ; broadcast s15 -> all 4 .s lanes of z14 ... bl shade_lamp_loop ; clobbers V14[127:64] ... fmla z20.s, p7/m, z14.s, z21.s ; reads all 4 .s lanes of z14 Under AAPCS64 only the low 64 bits of V8-V15 (D8-D15) are callee-saved. The prologue dutifully saves d14/d15, but the broadcast and FMA both touch the full Z register, and the upper half of v14 gets corrupted by the call -- producing wrong floating-point results. VNx2SF has GET_MODE_SIZE == 8, so the previous part-clobber check treated V14 as fully preserved for that mode. The mode size is correct for the *data* (two 32-bit lanes), but the underlying SVE instructions are not packed -- the two .s lanes live at byte offsets 0-3 and 8-11 of the Z register (the .d-strided layout used for partial SVE modes), so byte 8 onwards is outside the AAPCS64-preserved range. Any SVE mode in V8-V15 across a call is therefore partially clobbered, regardless of GET_MODE_SIZE. The change in g:b191e8bdecf made the cost of allocating a callee-saved register low enough that IRA started picking V14 for these pseudos, exposing the latent miscompile. Two places need fixing: * aarch64_hard_regno_call_part_clobbered uses GET_MODE_SIZE to decide whether the callee-preserved low 64 bits are enough for the value. For SVE modes, compare against BYTES_PER_SVE_VECTOR instead: any SVE mode physically uses the full Z register. * early-RA classifies allocno groups into FPR_D/FPR_Q/FPR_Z by mode bit size, and partial_fpr_clobbers then maps FPR_D to V8QImode for the ABI check. That synthetic V8QImode hides the fact that the pseudo is actually an SVE mode, so V8-V15 are not excluded from the candidate set. Classify any SVE mode as FPR_Z so that partial_fpr_clobbers uses VNx16QImode and correctly removes V8-V15. A new test is added in gcc.target/aarch64/sve/pr122827.c Before the fix the execution test aborts (V14 is allocated to the broadcast across the call, and the explicit clobber inside the callee corrupts its upper lanes). after the fix it passes. Explicitly clobbering V14 with inline assembly in the test may be a bit fragile as it depends on RA picking V14 for the pre-call broadcast to trigger the bug but I couldn't find a good way of generalising it without perturbing RA in ways that made the bug not trigger. With this patch blender passes again and no regressions on aarch64-none-linux-gnu bootstrap. Signed-off-by: Kyrylo Tkachov <[email protected]> gcc/ChangeLog: PR target/122827 * config/aarch64/aarch64.cc (aarch64_hard_regno_call_part_clobbered): For SVE modes use BYTES_PER_SVE_VECTOR for the per-register size rather than GET_MODE_SIZE. * config/aarch64/aarch64-early-ra.cc (early_ra::get_allocno_subgroup): Classify any SVE mode as FPR_Z. gcc/testsuite/ChangeLog: PR target/122827 * gcc.target/aarch64/sve/pr122827.c: New test. Diff: --- gcc/config/aarch64/aarch64-early-ra.cc | 10 ++- gcc/config/aarch64/aarch64.cc | 21 ++++-- gcc/testsuite/gcc.target/aarch64/sve/pr122827.c | 96 +++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 5 deletions(-) diff --git a/gcc/config/aarch64/aarch64-early-ra.cc b/gcc/config/aarch64/aarch64-early-ra.cc index 40a305130725..24f0e1bdffb8 100644 --- a/gcc/config/aarch64/aarch64-early-ra.cc +++ b/gcc/config/aarch64/aarch64-early-ra.cc @@ -1520,7 +1520,15 @@ early_ra::get_allocno_subgroup (rtx reg) group->has_flexible_stride = ((flags & HAS_FLEXIBLE_STRIDE) != 0 && (flags & HAS_FIXED_STRIDE) == 0); - group->fpr_size = (maybe_gt (fpr_bits, 128) ? FPR_Z + // SVE modes always occupy the full Z register, even for partial modes + // like VNx2SF whose mode size is only 64 bits. Treating such modes + // as FPR_D would let partial_fpr_clobbers ignore the fact that + // V8-V15 are only partially callee-saved under AAPCS64 -- the high + // bits of the Z register are clobbered by calls. Always classify + // SVE modes as FPR_Z so that V8-V15 are excluded as candidates for + // pseudos that are live across calls. + group->fpr_size = ((aarch64_sve_mode_p (GET_MODE (reg)) + || maybe_gt (fpr_bits, 128)) ? FPR_Z : maybe_gt (fpr_bits, 64) ? FPR_Q : FPR_D); entry = group; diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index ae93cb6bb2cd..5a859e12b1a5 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -2939,10 +2939,23 @@ aarch64_hard_regno_call_part_clobbered (unsigned int abi_id, && abi_id != ARM_PCS_SVE && abi_id != ARM_PCS_PRESERVE_NONE) { - poly_int64 per_register_size = GET_MODE_SIZE (mode); - unsigned int nregs = hard_regno_nregs (regno, mode); - if (nregs > 1) - per_register_size = exact_div (per_register_size, nregs); + poly_int64 per_register_size; + if (aarch64_sve_mode_p (mode)) + /* SVE instructions operate on the full SVE register, even for + partial modes like VNx2SF whose GET_MODE_SIZE is only 8 bytes + under -msve-vector-bits=128. The elements of such partial + modes are strided across the register and can lie outside the + callee-preserved low 64 bits. Compare against the full SVE + vector size so that V8-V15 are correctly recognised as + partially clobbered for any SVE mode under AAPCS64/AAPCS_SIMD. */ + per_register_size = BYTES_PER_SVE_VECTOR; + else + { + per_register_size = GET_MODE_SIZE (mode); + unsigned int nregs = hard_regno_nregs (regno, mode); + if (nregs > 1) + per_register_size = exact_div (per_register_size, nregs); + } if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC) return maybe_gt (per_register_size, 16); return maybe_gt (per_register_size, 8); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122827.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122827.c new file mode 100644 index 000000000000..30477685448e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122827.c @@ -0,0 +1,96 @@ +/* PR target/122827. + + With -msve-vector-bits=128 and SVE-only autovec, the inlined + madd_v3 calls are vectorised into SVE partial-mode FMLAs that share + a single vec_duplicate of the scalar FAC. + Before the fix, the register allocator placed the VNx2SF + broadcast in V14 across the call to clobber() because IRA thought + V14 was preserved (only the low 64 bits, D14, are callee-saved under + AAPCS64). The SVE FMLA reads the full Z register, so the upper + lanes of the broadcast are corrupted by the call and the result of + the FMLA is wrong. */ + +/* { dg-do run { target aarch64_sve128_hw } } */ +/* { dg-options "-Ofast -msve-vector-bits=128 --param=aarch64-autovec-preference=sve-only" } */ +/* { dg-require-effective-target aarch64_little_endian } */ + +struct R +{ + float combined[3]; + float spec[3]; + float diff[3]; + float diffshad[3]; + float shad[3]; +}; + +struct I +{ + float fac; + int passflag; +}; + +/* AAPCS64 only requires callees to preserve the low 64 bits of V8-V15; + the upper lanes are caller-clobbered. The buggy register allocator + in this test places the broadcast in V14 specifically, so clobbering + V14s upper bits here is sufficient to expose the miscompile. The + asm clobber list lets GCC save/restore D14 around the write, so we + honour the ABI contract for the low half while leaving the upper + half trashed, what any normal callee is permitted to do. */ +__attribute__((noipa)) static void +clobber (struct I *si, struct R *r) +{ + asm volatile ("movi v14.16b, #0xff" : : : "v14"); +} + +static inline __attribute__((always_inline)) void +madd_v3 (float r[3], const float a[3], float f) +{ + r[0] += a[0] * f; + r[1] += a[1] * f; + r[2] += a[2] * f; +} + +__attribute__((noipa)) void +test (struct I *si, struct R *r) +{ + struct R t = { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 }, + { 10, 11, 12 }, { 13, 14, 15 } }; + + clobber (si, r); + if (si->fac != 0.0f) + { + float fac = si->fac; + clobber (si, &t); + madd_v3 (r->combined, t.combined, fac); + if (si->passflag & 1) + madd_v3 (r->spec, t.spec, fac); + if (si->passflag & 2) + { + madd_v3 (r->diff, t.diff, fac); + madd_v3 (r->diffshad, t.diffshad, fac); + } + if (si->passflag & 4) + madd_v3 (r->shad, t.shad, fac); + } +} + +int +main (void) +{ + struct I si = { 2.0f, 7 }; + struct R r = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, + { 0, 0, 0 }, { 0, 0, 0 } }; + test (&si, &r); + if (r.combined[0] != 2.f || r.combined[1] != 4.f || r.combined[2] != 6.f) + __builtin_abort (); + if (r.spec[0] != 8.f || r.spec[1] != 10.f || r.spec[2] != 12.f) + __builtin_abort (); + if (r.diff[0] != 14.f || r.diff[1] != 16.f || r.diff[2] != 18.f) + __builtin_abort (); + if (r.diffshad[0] != 20.f || r.diffshad[1] != 22.f || r.diffshad[2] != 24.f) + __builtin_abort (); + if (r.shad[0] != 26.f || r.shad[1] != 28.f || r.shad[2] != 30.f) + __builtin_abort (); + return 0; +} +
