When basic block vectorization is extended to support predicated
vector tails, it attempts to vectorize more stores. This is only
done if the cost model deems it profitable, but the cost model
assumes that vec_init is cheap; in practice, that was not always
true.
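The kind of store group involved can be sketched in plain C (hypothetical function and parameter names; the new testcase at the end of this patch is the actual reproducer):

```c
#include <stdint.h>

/* Hypothetical sketch: four adjacent byte stores fed by four unrelated
   scalars.  If the SLP vectorizer covers them with a single predicated
   vector store, it must first gather the scalars into a vector with
   vec_init -- the operation whose cost the model assumed to be cheap.  */
void
store_group (uint8_t *dst, int a, int b, int c, int d)
{
  dst[0] = (uint8_t) a;
  dst[1] = (uint8_t) b;
  dst[2] = (uint8_t) c;
  dst[3] = (uint8_t) d;
}
```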
For example,
uint8_t * vectp.2689;
vector([4,4]) unsigned char _846;
vector([4,4]) <signed-boolean:4> slp_mask_848;
...
_846 = {_20, _30, _40, _50};
vectp.2689_847 = src_61(D) + 64;
slp_mask_848 = .WHILE_ULT (0, 4, { 0, ... });
.MASK_STORE (vectp.2689_847, 8B, slp_mask_848, _846);
was expected to have a vector cost of 4 (the same as the scalar
cost) but the code actually generated for
_846 = {_20, _30, _40, _50};
was a repetitive series of write-modify-read operations
using four stack locations for temporary storage:
(set (reg:VNx16BI Y)
(const_vector:VNx16BI repeat [
(const_int 1 [0x1])
]))
(set (mem/c:VNx4QI (plus:DI (reg/f:DI 96 virtual-stack-vars)
(const_poly_int:DI [-O, -O])) [0 S[O, O] A8])
(unspec:VNx4QI [
(subreg:VNx4BI (reg:VNx16BI Y) 0)
(reg:VNx4QI 205 [ _845 ])
] UNSPEC_PRED_X))
(set (reg:QI Z)
(subreg:QI (reg:SI X [ _W ]) 0))
(set (mem/c:QI (plus:DI (reg/f:DI 96 virtual-stack-vars)
(const_poly_int:DI [-O, -O])) [0 S1 A8])
(reg:QI Z))
(set (reg:VNx4QI 205 [ _845 ])
(unspec:VNx4QI [
(subreg:VNx4BI (reg:VNx16BI V) 0)
(mem/c:VNx4QI (plus:DI (reg/f:DI 96 virtual-stack-vars)
(const_poly_int:DI [-O, -O])) [0 S[O, O] A8])
] UNSPEC_PRED_X))
(repeated four times)
which compiled to something like:
addpl x5, sp, #6
st1b {z27.s}, p7, [sp, #3, mul vl]
strb w10, [x5]
ld1b {z28.s}, p7/z, [sp, #3, mul vl]
(repeated four times)
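In plain C terms, the generated sequence can be modelled as building the vector one element at a time, round-tripping the whole vector through a stack slot for every element (a behavioural sketch of the pattern above, not the RTL itself):

```c
#include <stdint.h>
#include <string.h>

enum { NELTS = 4 };

/* Behavioural sketch of the write-modify-read pattern: for each
   element, spill the whole vector (st1b), overwrite one byte (strb),
   then reload the whole vector (ld1b).  */
static void
build_via_stack (uint8_t vec[NELTS], const uint8_t elts[NELTS])
{
  uint8_t slot[NELTS];                 /* stack temporary */
  memset (vec, 0, NELTS);
  for (int i = 0; i < NELTS; i++)
    {
      memcpy (slot, vec, NELTS);       /* st1b: store whole vector */
      slot[i] = elts[i];               /* strb: modify one element */
      memcpy (vec, slot, NELTS);       /* ld1b: reload whole vector */
    }
}
```

So constructing an N-element vector this way costs N full vector stores and N full vector reloads, far more than the cheap vec_init the cost model assumed.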
With these changes, the compiled code is instead:
mov z31.b, w0
insr z31.s, s28
insr z31.s, s29
insr z31.s, s30
which is not yet optimal but is a great improvement.
To achieve that, "vec_init<mode><Vel>" was modified to
accept all SVE vector modes, which means that the
associated function aarch64_sve_expand_vector_init
must now handle all partial modes (namely, VNx8QI, VNx4QI,
VNx2QI, VNx4HI, VNx2HI, VNx2SI, VNx2HF, VNx4HF, VNx2SF,
VNx2BF, VNx4BF).
I verified that the following dependencies already
handle partial vector modes:
- "@aarch64_sve_<perm_insn><mode>" (for ZIP1)
- "*vec_duplicate<mode>_reg"
- maybe_code_for_aarch64_sve_rev
I did not verify that emit_move_insn (a dependency of
aarch64_sve_expand_vector_init_handle_trailing_constants)
handles partial vector modes, but it seems highly likely that it does.
"vec_shl_insert_<mode>" has been modified to accept SVE_ALL
instead of only SVE_FULL and operate on container instead of
element types.
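The INSR operation that "vec_shl_insert_<mode>" expands to can be modelled in plain C as "shift every element up one lane and insert the scalar at lane 0" (a behavioural sketch over four container elements, matching the mov + insr sequence shown earlier):

```c
#include <stdint.h>

enum { NELTS = 4 };

/* Behavioural model of INSR: every element moves up one lane and the
   scalar lands in lane 0, so the last value inserted ends up first.  */
static void
insr (uint32_t vec[NELTS], uint32_t scalar)
{
  for (int i = NELTS - 1; i > 0; i--)
    vec[i] = vec[i - 1];
  vec[0] = scalar;
}
```

A broadcast followed by three insr calls therefore fills all four lanes, with the broadcast value surviving only in the highest lane.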
gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md ("vec_init<mode><Vel>")
	("vec_shl_insert_<mode>"): Accept all SVE vector modes.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/slp_stack.c: New test.
---
gcc/config/aarch64/aarch64-sve.md | 16 +++++------
.../gcc.target/aarch64/sve/slp_stack.c | 27 +++++++++++++++++++
2 files changed, 35 insertions(+), 8 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_stack.c
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 97fd9516959..8f783afb51d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2945,7 +2945,7 @@ (define_insn "@aarch64_sve_ld1ro<mode>"
;; -------------------------------------------------------------------------
(define_expand "vec_init<mode><Vel>"
- [(match_operand:SVE_FULL 0 "register_operand")
+ [(match_operand:SVE_ALL 0 "register_operand")
(match_operand 1 "")]
"TARGET_SVE"
{
@@ -2989,17 +2989,17 @@ (define_expand "vec_initvnx16qivnx2qi"
;; Shift an SVE vector left and insert a scalar into element 0.
(define_insn "vec_shl_insert_<mode>"
- [(set (match_operand:SVE_FULL 0 "register_operand")
- (unspec:SVE_FULL
- [(match_operand:SVE_FULL 1 "register_operand")
+ [(set (match_operand:SVE_ALL 0 "register_operand")
+ (unspec:SVE_ALL
+ [(match_operand:SVE_ALL 1 "register_operand")
(match_operand:<VEL> 2 "aarch64_reg_or_zero")]
UNSPEC_INSR))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
- [ ?w , 0 , rZ ; * ] insr\t%0.<Vetype>, %<vwcore>2
- [ w , 0 , w ; * ] insr\t%0.<Vetype>, %<Vetype>2
- [ ??&w , w , rZ ; yes ] movprfx\t%0, %1\;insr\t%0.<Vetype>, %<vwcore>2
- [ ?&w , w , w ; yes ] movprfx\t%0, %1\;insr\t%0.<Vetype>, %<Vetype>2
+ [ ?w , 0 , rZ ; * ] insr\t%0.<Vctype>, %<vccore>2
+ [ w , 0 , w ; * ] insr\t%0.<Vctype>, %<Vctype>2
+ [ ??&w , w , rZ ; yes ] movprfx\t%0, %1\;insr\t%0.<Vctype>, %<vccore>2
+ [ ?&w , w , w ; yes ] movprfx\t%0, %1\;insr\t%0.<Vctype>, %<Vctype>2
}
[(set_attr "sve_type" "sve_int_general")]
)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_stack.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_stack.c
new file mode 100644
index 00000000000..76be816e0d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_stack.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mcpu=neoverse-n2 --param=aarch64-autovec-preference=sve-only -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+/* Without an efficient implementation of vec_init for partial SVE types, a
+ decision to vectorize a group in the basic block vectorizer can result in
+ code that repeatedly stores a whole vector on the stack, overwrites one
+ element, reloads the whole vector, stores it to another location,
+ overwrites another element, etc. This is a fairly minimal reproducer. */
+void
+vec_slp_pathological_stack (uint8_t *src)
+{
+ int lt = src[-33];
+ int l0 = src[-1];
+ int l1 = src[31];
+ int t0 = src[-32];
+ int t1 = src[-31];
+ int t2 = src[-30];
+ src[64] = (l1 + (2 * l0) + lt + 2) >> 2;
+ src[65] = (lt + t0 + 1) >> 1;
+ src[66] = (t0 + t1 + 1) >> 1;
+ src[67] = (t1 + t2 + 1) >> 1;
+}
+
+/* { dg-final { scan-assembler-not {sp} } } */
--
2.43.0