On Wed, 19 Nov 2025, Tamar Christina wrote:
> > -----Original Message-----
> > From: Richard Biener <[email protected]>
> > Sent: 18 November 2025 18:11
> > To: [email protected]
> > Cc: Tamar Christina <[email protected]>; [email protected]
> > Subject: [PATCH] tree-optimization/122722 - better SLP reduction group
> > discovery
> >
> > The following improves the all-or-nothing discovery of reduction
> > groups to consider sub-groups, by trying toplevel "matches" candidates
> > for this.  For simplicity, and to limit compile time, failed sub-group
> > matches are not decomposed further; only the originally failed part
> > is tried again to discover more sub-groups.  Any remaining failures
> > get picked up by the current single-reduction handling.
> >
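> > In sketch form the flow is roughly the following stand-alone C++
> > illustration (std::vector<int> and the hypothetical try_group stand in
> > for vec<stmt_vec_info> and the actual discovery routine; this is a
> > sketch, not the patch code):
> >
> >   #include <cstddef>
> >   #include <memory>
> >   #include <vector>
> >
> >   /* Hypothetical stand-in for discovery: succeed iff all lanes look
> >      alike, otherwise record per-lane match status against lane zero,
> >      mirroring how the toplevel matches[] gets populated.  */
> >   static bool
> >   try_group (const std::vector<int> &lanes, bool *matches)
> >   {
> >     bool ok = true;
> >     for (std::size_t i = 0; i < lanes.size (); ++i)
> >       {
> >         matches[i] = lanes[i] == lanes[0];
> >         ok &= matches[i];
> >       }
> >     return ok;
> >   }
> >
> >   int
> >   main ()
> >   {
> >     /* Two size-2 sub-groups, as in the testcase below.  */
> >     std::vector<int> lanes = {1, 1, 2, 2};
> >     std::unique_ptr<bool[]> matches (new bool[lanes.size ()]);
> >     if (try_group (lanes, matches.get ()))
> >       return 0;
> >     while (!lanes.empty () && matches[0])
> >       {
> >         /* Peel off the matching lanes as one sub-group ...  */
> >         std::vector<int> cands, rest;
> >         for (std::size_t i = 0; i < lanes.size (); ++i)
> >           (matches[i] ? cands : rest).push_back (lanes[i]);
> >         /* ... try it once, not decomposing further on failure ...  */
> >         try_group (cands, matches.get ());
> >         /* ... and retry the non-matching remainder once.  */
> >         lanes = rest;
> >         if (!lanes.empty () && try_group (lanes, matches.get ()))
> >           break;
> >       }
> >     /* Whatever remains here gets single-reduction treatment.  */
> >     return 0;
> >   }
> >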
> > Such opportunities happen in hot spots of the ARM Adaptive Scalable
> > Texture Compression (ASTC) encoder, where I ran into them.  I'm not sure
> > to what extent VLA vector ISAs benefit from SLP reductions though;
> > the encoder has up to 32 concurrent reductions running.  It probably
> > most benefits RVV, PowerPC and s390, as those have no HW mapping for the
> > encoder's C++ vector implementation.  Of course the software path should
> > also benefit on x86.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> >
> > Any opinions?
>
> I think it's a good one even for VLA, as the main benefit is that it
> reduces the code in the vector loop when splitting the accumulator, no?
>
> i.e. in the example given we no longer need to deinterleave the elements
> in the loop.
>
> It does make the epilogue more expensive because the deinterleaving moves
> there, but still that's once vs every iteration.
>
> It's too bad that we don't re-interleave at the main exit,
>
> e.g.
>
> void foo (int * __restrict sums, int *a, int *b, int n)
> {
>   for (int i = 0; i < (n & -4); ++i)
>     {
>       sums[0] = sums[0] + a[2*i];
>       sums[1] = sums[1] + a[2*i+1];
>       sums[2] = sums[2] + b[2*i];
>       sums[3] = sums[3] + b[2*i+1];
>     }
> }
>
> Generates
>
> .L3:
>         ldr     q0, [x1, x4]
>         ldr     q29, [x2, x4]
>         add     x4, x4, 16
>         add     v30.4s, v30.4s, v0.4s
>         add     v31.4s, v31.4s, v29.4s
>         cmp     x3, x4
>         bne     .L3
>         dup     s26, v30.s[1]
>         dup     s24, v30.s[3]
>         dup     s28, v31.s[3]
>         dup     s25, v31.s[1]
>         dup     s27, v30.s[2]
>         dup     s29, v31.s[2]
>         add     v26.2s, v26.2s, v24.2s
>         add     v28.2s, v28.2s, v25.2s
>         add     v30.2s, v27.2s, v30.2s
>         add     v31.2s, v29.2s, v31.2s
>         stp     s30, s26, [x0]
>         stp     s31, s28, [x0, 8]
> .L1:
>         ret
>
> which I think could be much better had we re-blended v30 and v31; that
> would also have helped avoid the particularly bad VLA code in the
> epilogue
>
> .L3:
>         ld1w    z4.s, p7/z, [x1, x4, lsl 2]
>         ld1w    z3.s, p7/z, [x2, x4, lsl 2]
>         add     z0.s, p7/m, z0.s, z4.s
>         add     x4, x4, x5
>         add     z30.s, p7/m, z30.s, z3.s
>         whilelo p7.s, x4, x3
>         b.any   .L3
>         ptrue   p7.b, all
>         movi    d29, #0
>         index   z2.s, #0, #1
>         and     z2.s, z2.s, #0x1
>         cmpeq   p15.s, p7/z, z2.s, z29.s
>         index   z1.s, #-1, #-1
>         sel     z28.s, p15, z0.s, z29.s
>         sel     z31.s, p15, z30.s, z29.s
>         uaddv   d28, p7, z28.s
>         uaddv   d31, p7, z31.s
>         and     z1.s, z1.s, #0x1
>         cmpeq   p15.s, p7/z, z1.s, z29.s
>         sel     z0.s, p15, z0.s, z29.s
>         sel     z29.s, p15, z30.s, z29.s
>         uaddv   d0, p7, z0.s
>         uaddv   d29, p7, z29.s
>         stp     s28, s0, [x0]
>         stp     s31, s29, [x0, 8]
> .L1:
>         ret
>
> Our cost models take this into account, so I'm not overly concerned, and
> I think it's a good step forward!
>
> >
> > Thanks,
> > Richard.
> >
> >         PR tree-optimization/122722
> >         * tree-vect-slp.cc (vect_analyze_slp_reductions): New
> >         function, split out from vect_analyze_slp.  Try SLP
> >         sub-groups.
> >         (vect_analyze_slp_reduction_group): New helper.
> >
> >         * gcc.dg/vect/slp-reduc-14.c: New testcase.
> > ---
> >  gcc/testsuite/gcc.dg/vect/slp-reduc-14.c |  15 ++
> >  gcc/tree-vect-slp.cc                     | 234 +++++++++++++++++------
> >  2 files changed, 191 insertions(+), 58 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
> > new file mode 100644
> > index 00000000000..7966d23716a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
> > @@ -0,0 +1,15 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> > +
> > +void foo (int * __restrict sums, int *a, int *b, int n)
> > +{
> > +  for (int i = 0; i < n; ++i)
> > +    {
> > +      sums[0] = sums[0] + a[2*i];
> > +      sums[1] = sums[1] + a[2*i+1];
> > +      sums[2] = sums[2] + b[2*i];
> > +      sums[3] = sums[3] + b[2*i+1];
> > +    }
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "SLP discovery of size 2 reduction group" 2 "vect" } } */
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index 07e22ea7ccf..3e403e7b1ea 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -4723,6 +4723,179 @@ vect_analyze_slp_reduction (loop_vec_info vinfo,
> >    return false;
> >  }
> >
> > +/* Analyze a single SLP reduction group.  If successful add a SLP instance
> > +   for it and return true, otherwise return false and have *MATCHES
> > +   populated.  */
> > +
> > +static bool
> > +vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
> > +                                  vec<stmt_vec_info> scalar_stmts,
> > +                                  scalar_stmts_to_slp_tree_map_t *bst_map,
> > +                                  unsigned max_tree_size, unsigned *limit,
> > +                                  bool *matches)
> > +{
> > +  /* Try to form a reduction group.  */
> > +  unsigned int group_size = scalar_stmts.length ();
> > +  if (!matches)
> > +    matches = XALLOCAVEC (bool, group_size);
> > +  poly_uint64 max_nunits = 1;
> > +  unsigned tree_size = 0;
> > +  slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
> > +                                       group_size,
> > +                                       &max_nunits, matches, limit,
> > +                                       &tree_size, bst_map);
> > +  if (!node)
> > +    return false;
> > +
> > +  /* Create a new SLP instance.  */
> > +  slp_instance new_instance = XNEW (class _slp_instance);
> > +  SLP_INSTANCE_TREE (new_instance) = node;
> > +  SLP_INSTANCE_LOADS (new_instance) = vNULL;
> > +  SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
> > +  SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
> > +  SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
> > +  new_instance->reduc_phis = NULL;
> > +  new_instance->cost_vec = vNULL;
> > +  new_instance->subgraph_entries = vNULL;
> > +
> > +  if (dump_enabled_p ())
> > +    dump_printf_loc (MSG_NOTE, vect_location,
> > +                     "SLP size %u vs. limit %u.\n",
> > +                     tree_size, max_tree_size);
> > +
> > +  loop_vinfo->slp_instances.safe_push (new_instance);
> > +
> > +  /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
> > +     the number of scalar stmts in the root in a few places.
> > +     Verify that assumption holds.  */
> > +  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
> > +              .length () == group_size);
> > +
> > +  if (dump_enabled_p ())
> > +    {
> > +      dump_printf_loc (MSG_NOTE, vect_location,
> > +                       "SLP discovery of size %d reduction group "
> > +                       "succeeded\n", group_size);
> > +      dump_printf_loc (MSG_NOTE, vect_location,
> > +                       "Final SLP tree for instance %p:\n",
> > +                       (void *) new_instance);
> > +      vect_print_slp_graph (MSG_NOTE, vect_location,
> > +                            SLP_INSTANCE_TREE (new_instance));
> > +    }
> > +
> > +  return true;
> > +}
> > +
> > +/* Analyze reductions in LOOP_VINFO and populate SLP instances
> > +   accordingly.  Returns false if something fails.  */
> > +
> > +static bool
> > +vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
> > +                             unsigned max_tree_size, unsigned *limit,
> > +                             scalar_stmts_to_slp_tree_map_t *bst_map,
> > +                             bool force_single_lane)
> > +{
> > +  if (loop_vinfo->reductions.is_empty ())
> > +    return true;
> > +
> > +  /* Collect reduction statements we can combine into
> > +     a SLP reduction.  */
> > +  vec<stmt_vec_info> scalar_stmts;
> > +  scalar_stmts.create (loop_vinfo->reductions.length ());
> > +  for (auto next_info : loop_vinfo->reductions)
> > +    {
> > +      next_info = vect_stmt_to_vectorize (next_info);
> > +      if ((STMT_VINFO_RELEVANT_P (next_info)
> > +           || STMT_VINFO_LIVE_P (next_info))
> > +          /* ??? Make sure we didn't skip a conversion around a
> > +             reduction path.  In that case we'd have to reverse
> > +             engineer that conversion stmt following the chain using
> > +             reduc_idx and from the PHI using reduc_def.  */
> > +          && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
> > +              || (STMT_VINFO_DEF_TYPE (next_info)
> > +                  == vect_double_reduction_def)))
> > +        {
> > +          /* Do not discover SLP reductions combining lane-reducing
> > +             ops, that will fail later.  */
> > +          if (!force_single_lane
> > +              && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
> > +            scalar_stmts.quick_push (next_info);
> > +          /* Do SLP discovery for single-lane reductions.  */
> > +          else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
> > +                                                 max_tree_size, limit,
> > +                                                 bst_map,
> > +                                                 force_single_lane))
> > +            return false;
> > +        }
> > +    }
> > +
> > +  if (scalar_stmts.length () > 1)
> > +    {
> > +      /* Try to form a reduction group.  */
> > +      unsigned int group_size = scalar_stmts.length ();
> > +      bool *matches = XALLOCAVEC (bool, group_size);
> > +      if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
> > +                                            bst_map, max_tree_size, limit,
> > +                                            matches))
> > +        return true;
> > +
> > +      /* When analysis as a single SLP reduction group failed try to
> > +         form sub-groups by collecting matching lanes.  Do not recurse
> > +         that on failure (to limit compile-time costs), but recurse
> > +         for the initial non-matching parts.  Everything not covered
> > +         by a sub-group gets single-reduction treatment.  */
> > +      vec<stmt_vec_info> cands;
> > +      cands.create (group_size);
> > +      while (matches[0])
> > +        {
> > +          cands.truncate (0);
> > +          for (unsigned i = 0; i < group_size; ++i)
> > +            if (matches[i])
> > +              cands.quick_push (scalar_stmts[i]);
> > +
> > +          /* Try to form a reduction group.  */
> > +          if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
> > +                                                max_tree_size, limit, NULL))
> > +            {
> > +              cands = vNULL;
> > +              cands.create (group_size);
> > +            }
>
> Could you not move the create under the truncate (0) call above, remove
> the one outside the loop, and then this block just becomes
> if (!vect_analyze_slp_reduction_group (...))?
>
> The truncate should be safe on a null/empty vec.
If the vector is already allocated a .create () will cause a leak. I
could .release () it, but then I'd have a double allocation. I think

  cands.truncate (0);
  cands.reserve (group_size, true);

works.
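
For the record, the std::vector analogue of the trade-off (purely
illustrative; GCC's vec is manually managed, so a second .create () on a
live vec really does leak there, unlike anything std::vector can do):

  #include <vector>

  int
  main ()
  {
    const unsigned group_size = 4;
    std::vector<int> cands;
    cands.reserve (group_size);  /* one allocation up front */
    for (int round = 0; round < 3; ++round)
      {
        /* clear () + reserve () keeps the existing storage, like
           truncate (0) + reserve (group_size, true) on GCC's vec;
           constructing a fresh vector each round would free and
           re-acquire the storage instead (the double allocation).  */
        cands.clear ();
        cands.reserve (group_size);
        for (unsigned i = 0; i < group_size; ++i)
          cands.push_back ((int) i);
      }
    return 0;
  }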
Richard.
> Thanks,
> Tamar
>
> > +          else
> > +            {
> > +              /* Do SLP discovery for single-lane reductions.  */
> > +              for (auto stmt_info : cands)
> > +                if (! vect_analyze_slp_reduction (loop_vinfo,
> > +                                                  vect_stmt_to_vectorize
> > +                                                    (stmt_info),
> > +                                                  max_tree_size, limit,
> > +                                                  bst_map, force_single_lane))
> > +                  return false;
> > +            }
> > +          unsigned j = 0;
> > +          for (unsigned i = 0; i < group_size; ++i)
> > +            if (!matches[i])
> > +              {
> > +                scalar_stmts[j] = scalar_stmts[i];
> > +                ++j;
> > +              }
> > +          scalar_stmts.truncate (j);
> > +          group_size = scalar_stmts.length ();
> > +          if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
> > +                                                bst_map, max_tree_size, limit,
> > +                                                matches))
> > +            return true;
> > +        }
> > +    }
> > +  /* Do SLP discovery for single-lane reductions.  */
> > +  for (auto stmt_info : scalar_stmts)
> > +    if (! vect_analyze_slp_reduction (loop_vinfo,
> > +                                      vect_stmt_to_vectorize (stmt_info),
> > +                                      max_tree_size, limit,
> > +                                      bst_map, force_single_lane))
> > +      return false;
> > +
> > +  return true;
> > +}
> > +
> >  /* Analyze an SLP instance starting from a group of grouped stores.  Call
> >     vect_build_slp_tree to build a tree of packed stmts if possible.
> >     Return FALSE if it's impossible to SLP any stmt in the group.  */
> > @@ -5617,64 +5790,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
> >    if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> >      {
> >        /* Find SLP sequences starting from groups of reductions.  */
> > -      if (loop_vinfo->reductions.length () > 0)
> > -        {
> > -          /* Collect reduction statements we can combine into
> > -             a SLP reduction.  */
> > -          vec<stmt_vec_info> scalar_stmts;
> > -          scalar_stmts.create (loop_vinfo->reductions.length ());
> > -          for (auto next_info : loop_vinfo->reductions)
> > -            {
> > -              next_info = vect_stmt_to_vectorize (next_info);
> > -              if ((STMT_VINFO_RELEVANT_P (next_info)
> > -                   || STMT_VINFO_LIVE_P (next_info))
> > -                  /* ??? Make sure we didn't skip a conversion around a
> > -                     reduction path.  In that case we'd have to reverse
> > -                     engineer that conversion stmt following the chain using
> > -                     reduc_idx and from the PHI using reduc_def.  */
> > -                  && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
> > -                      || (STMT_VINFO_DEF_TYPE (next_info)
> > -                          == vect_double_reduction_def)))
> > -                {
> > -                  /* Do not discover SLP reductions combining lane-reducing
> > -                     ops, that will fail later.  */
> > -                  if (!force_single_lane
> > -                      && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
> > -                    scalar_stmts.quick_push (next_info);
> > -                  /* Do SLP discovery for single-lane reductions.  */
> > -                  else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
> > -                                                         max_tree_size, &limit,
> > -                                                         bst_map,
> > -                                                         force_single_lane))
> > -                    return opt_result::failure_at (vect_location,
> > -                                                   "SLP build failed.\n");
> > -                }
> > -            }
> > -          /* Save for re-processing on failure.  */
> > -          vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
> > -          vec<stmt_vec_info> roots = vNULL;
> > -          vec<tree> remain = vNULL;
> > -          if (scalar_stmts.length () <= 1
> > -              || !vect_build_slp_instance (loop_vinfo,
> > -                                           slp_inst_kind_reduc_group,
> > -                                           scalar_stmts, roots, remain,
> > -                                           max_tree_size, &limit, bst_map,
> > -                                           force_single_lane))
> > -            {
> > -              if (scalar_stmts.length () <= 1)
> > -                scalar_stmts.release ();
> > -              /* Do SLP discovery for single-lane reductions.  */
> > -              for (auto stmt_info : saved_stmts)
> > -                if (! vect_analyze_slp_reduction (loop_vinfo,
> > -                                                  vect_stmt_to_vectorize
> > -                                                    (stmt_info),
> > -                                                  max_tree_size, &limit,
> > -                                                  bst_map, force_single_lane))
> > -                  return opt_result::failure_at (vect_location,
> > -                                                 "SLP build failed.\n");
> > -            }
> > -          saved_stmts.release ();
> > -        }
> > +      if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
> > +                                        bst_map, force_single_lane))
> > +        return opt_result::failure_at (vect_location, "SLP build failed.\n");
> >
> >        /* Make sure to vectorize only-live stmts, usually inductions.  */
> >        for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
> > --
> > 2.51.0
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)