https://gcc.gnu.org/g:5cb0842b8b86f8495a110018fb8af3c4c3316605

commit r17-1143-g5cb0842b8b86f8495a110018fb8af3c4c3316605
Author: Kyrylo Tkachov <[email protected]>
Date:   Wed May 27 13:47:48 2026 -0700

    PR target/122827: aarch64: Treat SVE modes as full-Z for callee-save 
accounting
    
    PR122827 is a miscompile of 526.blender_r in SPEC2017 with
    -Ofast -flto=auto -mcpu=neoverse-v2 -msve-vector-bits=128 --param 
aarch64-autovec-preference=sve-only.
    Bisection landed on g:b191e8bdecf (the IRA cost-model rework for PR117477), 
and the
    miscompile disappears with -mearly-ra=none, -fno-caller-saves, or
    -fno-early-ra, all of which steer register allocation away from V8-V15.
    
    The bug is in shade_material_loop in shadeinput.c, where a broadcast of
    a float scalar to a VNx2SF value is held in V14 across a call:
    
        mov     z14.s, s15            ; broadcast s15 -> all 4 .s lanes of z14
        ...
        bl      shade_lamp_loop       ; clobbers V14[127:64]
        ...
        fmla    z20.s, p7/m, z14.s, z21.s   ; reads all 4 .s lanes of z14
    
    Under AAPCS64 only the low 64 bits of V8-V15 (D8-D15) are callee-saved.
    The prologue dutifully saves d14/d15, but the broadcast and FMA both
    touch the full Z register, and the upper half of v14 gets corrupted by
    the call -- producing wrong floating-point results.
    
    VNx2SF has GET_MODE_SIZE == 8, so the previous part-clobber check
    treated V14 as fully preserved for that mode.  The mode size is
    correct for the *data* (two 32-bit lanes), but the underlying SVE
    instructions are not packed -- the two .s lanes live at byte offsets
    0-3 and 8-11 of the Z register (the .d-strided layout used for partial
    SVE modes), so byte 8 onwards is outside the AAPCS64-preserved range.
    Any SVE mode in V8-V15 across a call is therefore partially clobbered,
    regardless of GET_MODE_SIZE.
    
    The change in g:b191e8bdecf made the cost of allocating a callee-saved
    register low enough that IRA started picking V14 for these pseudos,
    exposing the latent miscompile.
    
    Two places need fixing:
    
      * aarch64_hard_regno_call_part_clobbered uses GET_MODE_SIZE to decide
        whether the callee-preserved low 64 bits are enough for the value.
        For SVE modes, compare against BYTES_PER_SVE_VECTOR instead: any
        SVE mode physically uses the full Z register.
    
      * early-RA classifies allocno groups into FPR_D/FPR_Q/FPR_Z by mode
        bit size, and partial_fpr_clobbers then maps FPR_D to V8QImode for
        the ABI check.  That synthetic V8QImode hides the fact that the
        pseudo is actually an SVE mode, so V8-V15 are not excluded from the
        candidate set.  Classify any SVE mode as FPR_Z so that
        partial_fpr_clobbers uses VNx16QImode and correctly removes V8-V15.
    
    A new test is added in gcc.target/aarch64/sve/pr122827.c
    Before the fix the execution test aborts (V14 is allocated to the broadcast
    across the call, and the explicit clobber inside the callee corrupts its 
upper
    lanes). after the fix it passes.
    
    Explicitly clobbering V14 with inline assembly in the test may be a bit 
fragile
    as it depends on RA picking V14 for the pre-call broadcast to trigger the 
bug
    but I couldn't find a good way of generalising it without perturbing RA in 
ways
    that made the bug not trigger.
    
    With this patch blender passes again and no regressions on
    aarch64-none-linux-gnu bootstrap.
    
    Signed-off-by: Kyrylo Tkachov <[email protected]>
    
    gcc/ChangeLog:
    
            PR target/122827
            * config/aarch64/aarch64.cc 
(aarch64_hard_regno_call_part_clobbered):
            For SVE modes use BYTES_PER_SVE_VECTOR for the per-register size
            rather than GET_MODE_SIZE.
            * config/aarch64/aarch64-early-ra.cc 
(early_ra::get_allocno_subgroup):
            Classify any SVE mode as FPR_Z.
    
    gcc/testsuite/ChangeLog:
    
            PR target/122827
            * gcc.target/aarch64/sve/pr122827.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-early-ra.cc          | 10 ++-
 gcc/config/aarch64/aarch64.cc                   | 21 ++++--
 gcc/testsuite/gcc.target/aarch64/sve/pr122827.c | 96 +++++++++++++++++++++++++
 3 files changed, 122 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-early-ra.cc 
b/gcc/config/aarch64/aarch64-early-ra.cc
index 40a305130725..24f0e1bdffb8 100644
--- a/gcc/config/aarch64/aarch64-early-ra.cc
+++ b/gcc/config/aarch64/aarch64-early-ra.cc
@@ -1520,7 +1520,15 @@ early_ra::get_allocno_subgroup (rtx reg)
       group->has_flexible_stride = ((flags & HAS_FLEXIBLE_STRIDE) != 0
                                    && (flags & HAS_FIXED_STRIDE) == 0);
 
-      group->fpr_size = (maybe_gt (fpr_bits, 128) ? FPR_Z
+      // SVE modes always occupy the full Z register, even for partial modes
+      // like VNx2SF whose mode size is only 64 bits.  Treating such modes
+      // as FPR_D would let partial_fpr_clobbers ignore the fact that
+      // V8-V15 are only partially callee-saved under AAPCS64 -- the high
+      // bits of the Z register are clobbered by calls.  Always classify
+      // SVE modes as FPR_Z so that V8-V15 are excluded as candidates for
+      // pseudos that are live across calls.
+      group->fpr_size = ((aarch64_sve_mode_p (GET_MODE (reg))
+                         || maybe_gt (fpr_bits, 128)) ? FPR_Z
                         : maybe_gt (fpr_bits, 64) ? FPR_Q : FPR_D);
 
       entry = group;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index ae93cb6bb2cd..5a859e12b1a5 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2939,10 +2939,23 @@ aarch64_hard_regno_call_part_clobbered (unsigned int 
abi_id,
       && abi_id != ARM_PCS_SVE
       && abi_id != ARM_PCS_PRESERVE_NONE)
     {
-      poly_int64 per_register_size = GET_MODE_SIZE (mode);
-      unsigned int nregs = hard_regno_nregs (regno, mode);
-      if (nregs > 1)
-       per_register_size = exact_div (per_register_size, nregs);
+      poly_int64 per_register_size;
+      if (aarch64_sve_mode_p (mode))
+       /* SVE instructions operate on the full SVE register, even for
+          partial modes like VNx2SF whose GET_MODE_SIZE is only 8 bytes
+          under -msve-vector-bits=128.  The elements of such partial
+          modes are strided across the register and can lie outside the
+          callee-preserved low 64 bits.  Compare against the full SVE
+          vector size so that V8-V15 are correctly recognised as
+          partially clobbered for any SVE mode under AAPCS64/AAPCS_SIMD.  */
+       per_register_size = BYTES_PER_SVE_VECTOR;
+      else
+       {
+         per_register_size = GET_MODE_SIZE (mode);
+         unsigned int nregs = hard_regno_nregs (regno, mode);
+         if (nregs > 1)
+           per_register_size = exact_div (per_register_size, nregs);
+       }
       if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
        return maybe_gt (per_register_size, 16);
       return maybe_gt (per_register_size, 8);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122827.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122827.c
new file mode 100644
index 000000000000..30477685448e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122827.c
@@ -0,0 +1,96 @@
+/* PR target/122827.
+
+   With -msve-vector-bits=128 and SVE-only autovec, the inlined
+   madd_v3 calls are vectorised into SVE partial-mode FMLAs that share
+   a single vec_duplicate of the scalar FAC.
+   Before the fix, the register allocator placed the VNx2SF
+   broadcast in V14 across the call to clobber() because IRA thought
+   V14 was preserved (only the low 64 bits, D14, are callee-saved under
+   AAPCS64).  The SVE FMLA reads the full Z register, so the upper
+   lanes of the broadcast are corrupted by the call and the result of
+   the FMLA is wrong.  */
+
+/* { dg-do run { target aarch64_sve128_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=128 
--param=aarch64-autovec-preference=sve-only" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+struct R
+{
+  float combined[3];
+  float spec[3];
+  float diff[3];
+  float diffshad[3];
+  float shad[3];
+};
+
+struct I
+{
+  float fac;
+  int passflag;
+};
+
+/* AAPCS64 only requires callees to preserve the low 64 bits of V8-V15;
+   the upper lanes are caller-clobbered.  The buggy register allocator
+   in this test places the broadcast in V14 specifically, so clobbering
+   V14s upper bits here is sufficient to expose the miscompile.  The
+   asm clobber list lets GCC save/restore D14 around the write, so we
+   honour the ABI contract for the low half while leaving the upper
+   half trashed, what any normal callee is permitted to do.  */
+__attribute__((noipa)) static void
+clobber (struct I *si, struct R *r)
+{
+  asm volatile ("movi v14.16b, #0xff" : : : "v14");
+}
+
+static inline __attribute__((always_inline)) void
+madd_v3 (float r[3], const float a[3], float f)
+{
+  r[0] += a[0] * f;
+  r[1] += a[1] * f;
+  r[2] += a[2] * f;
+}
+
+__attribute__((noipa)) void
+test (struct I *si, struct R *r)
+{
+  struct R t = { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 },
+                { 10, 11, 12 }, { 13, 14, 15 } };
+
+  clobber (si, r);
+  if (si->fac != 0.0f)
+    {
+      float fac = si->fac;
+      clobber (si, &t);
+      madd_v3 (r->combined, t.combined, fac);
+      if (si->passflag & 1)
+       madd_v3 (r->spec, t.spec, fac);
+      if (si->passflag & 2)
+       {
+         madd_v3 (r->diff, t.diff, fac);
+         madd_v3 (r->diffshad, t.diffshad, fac);
+       }
+      if (si->passflag & 4)
+       madd_v3 (r->shad, t.shad, fac);
+    }
+}
+
+int
+main (void)
+{
+  struct I si = { 2.0f, 7 };
+  struct R r = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+                { 0, 0, 0 }, { 0, 0, 0 } };
+  test (&si, &r);
+  if (r.combined[0] != 2.f || r.combined[1] != 4.f || r.combined[2] != 6.f)
+    __builtin_abort ();
+  if (r.spec[0] != 8.f || r.spec[1] != 10.f || r.spec[2] != 12.f)
+    __builtin_abort ();
+  if (r.diff[0] != 14.f || r.diff[1] != 16.f || r.diff[2] != 18.f)
+    __builtin_abort ();
+  if (r.diffshad[0] != 20.f || r.diffshad[1] != 22.f || r.diffshad[2] != 24.f)
+    __builtin_abort ();
+  if (r.shad[0] != 26.f || r.shad[1] != 28.f || r.shad[2] != 30.f)
+    __builtin_abort ();
+  return 0;
+}
+

Reply via email to