> -----Original Message-----
> From: Richard Biener <[email protected]>
> Sent: 18 November 2025 18:11
> To: [email protected]
> Cc: Tamar Christina <[email protected]>; [email protected]
> Subject: [PATCH] tree-optimization/122722 - better SLP reduction group
> discovery
>
> The following improves the all-or-nothing discovery of reduction
> groups to consider sub-groups by trying toplevel "matches" candidates
> for this. For simplicity and to limit compile-time failed sub-group
> matches are not decomposed further, only the originally failed part
> is tried again to discover more sub-groups. Any remaining fails
> get picked up by the current single-reduction handling.
>
> Such opportunities happen in hot spots of the ARM Adaptive Scalable
> Texture Compression Encoder (ASTC) where I ran into them, I'm not sure
> to what extent VLA vector ISAs benefit from SLP reductions though,
> the encoder has up to 32 concurrent reductions running. It probably
> most benefits RVV, PowerPC and s390 as there's no HW mapping for its C++
> vector implementation there. Of course the software path should also
> benefit on x86.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Any opinions?
I think it's a good one even for VLA as the main benefit is that it reduces
the code in the vector loop when splitting the accumulator, no?
i.e. In the example given we no longer need to deinterleave the elements in
the loop.
It does make the epilogue more expensive because the deinterleaving moves
there, but still that's once vs every iteration.
It's too bad that we don't re-interleave at the main exit,
e.g.
void foo (int * __restrict sums, int *a, int *b, int n)
{
for (int i = 0; i < (n & -4); ++i)
{
sums[0] = sums[0] + a[2*i];
sums[1] = sums[1] + a[2*i+1];
sums[2] = sums[2] + b[2*i];
sums[3] = sums[3] + b[2*i+1];
}
}
Generates
.L3:
ldr q0, [x1, x4]
ldr q29, [x2, x4]
add x4, x4, 16
add v30.4s, v30.4s, v0.4s
add v31.4s, v31.4s, v29.4s
cmp x3, x4
bne .L3
dup s26, v30.s[1]
dup s24, v30.s[3]
dup s28, v31.s[3]
dup s25, v31.s[1]
dup s27, v30.s[2]
dup s29, v31.s[2]
add v26.2s, v26.2s, v24.2s
add v28.2s, v28.2s, v25.2s
add v30.2s, v27.2s, v30.2s
add v31.2s, v29.2s, v31.2s
stp s30, s26, [x0]
stp s31, s28, [x0, 8]
.L1:
ret
which I think could be much better had we re-blended v30 and v31, which I think
would have helped avoid the particularly bad VLA code in the epilogue
.L3:
ld1w z4.s, p7/z, [x1, x4, lsl 2]
ld1w z3.s, p7/z, [x2, x4, lsl 2]
add z0.s, p7/m, z0.s, z4.s
add x4, x4, x5
add z30.s, p7/m, z30.s, z3.s
whilelo p7.s, x4, x3
b.any .L3
ptrue p7.b, all
movi d29, #0
index z2.s, #0, #1
and z2.s, z2.s, #0x1
cmpeq p15.s, p7/z, z2.s, z29.s
index z1.s, #-1, #-1
sel z28.s, p15, z0.s, z29.s
sel z31.s, p15, z30.s, z29.s
uaddv d28, p7, z28.s
uaddv d31, p7, z31.s
and z1.s, z1.s, #0x1
cmpeq p15.s, p7/z, z1.s, z29.s
sel z0.s, p15, z0.s, z29.s
sel z29.s, p15, z30.s, z29.s
uaddv d0, p7, z0.s
uaddv d29, p7, z29.s
stp s28, s0, [x0]
stp s31, s29, [x0, 8]
.L1:
ret
Our cost models take this into account so I'm not overly concerned, and I
think it's a good step forward!
>
> Thanks,
> Richard.
>
> PR tree-optimization/122722
> * tree-vect-slp.cc (vect_analyze_slp_reductions): New
> function, split out from vect_analyze_slp. Try SLP
> sub-groups.
> (vect_analyze_slp_reduction_group): New helper.
>
> * gcc.dg/vect/slp-reduc-14.c: New testcase.
> ---
> gcc/testsuite/gcc.dg/vect/slp-reduc-14.c | 15 ++
> gcc/tree-vect-slp.cc | 234 +++++++++++++++++------
> 2 files changed, 191 insertions(+), 58 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
> b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
> new file mode 100644
> index 00000000000..7966d23716a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +
> +void foo (int * __restrict sums, int *a, int *b, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + {
> + sums[0] = sums[0] + a[2*i];
> + sums[1] = sums[1] + a[2*i+1];
> + sums[2] = sums[2] + b[2*i];
> + sums[3] = sums[3] + b[2*i+1];
> + }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "SLP discovery of size 2 reduction
> group" 2 "vect" } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 07e22ea7ccf..3e403e7b1ea 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -4723,6 +4723,179 @@ vect_analyze_slp_reduction (loop_vec_info
> vinfo,
> return false;
> }
>
> +/* Analyze a single SLP reduction group. If successful add a SLP instance
> + for it and return true, otherwise return false and have *MATCHES
> + populated. */
> +
> +static bool
> +vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
> + vec<stmt_vec_info> scalar_stmts,
> + scalar_stmts_to_slp_tree_map_t *bst_map,
> + unsigned max_tree_size, unsigned *limit,
> + bool *matches)
> +{
> + /* Try to form a reduction group. */
> + unsigned int group_size = scalar_stmts.length ();
> + if (!matches)
> + matches = XALLOCAVEC (bool, group_size);
> + poly_uint64 max_nunits = 1;
> + unsigned tree_size = 0;
> + slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
> + group_size,
> + &max_nunits, matches, limit,
> + &tree_size, bst_map);
> + if (!node)
> + return false;
> +
> + /* Create a new SLP instance. */
> + slp_instance new_instance = XNEW (class _slp_instance);
> + SLP_INSTANCE_TREE (new_instance) = node;
> + SLP_INSTANCE_LOADS (new_instance) = vNULL;
> + SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
> + SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
> + SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
> + new_instance->reduc_phis = NULL;
> + new_instance->cost_vec = vNULL;
> + new_instance->subgraph_entries = vNULL;
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "SLP size %u vs. limit %u.\n",
> + tree_size, max_tree_size);
> +
> + loop_vinfo->slp_instances.safe_push (new_instance);
> +
> + /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
> + the number of scalar stmts in the root in a few places.
> + Verify that assumption holds. */
> + gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE
> (new_instance))
> + .length () == group_size);
> +
> + if (dump_enabled_p ())
> + {
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "SLP discovery of size %d reduction group "
> + "succeeded\n", group_size);
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "Final SLP tree for instance %p:\n",
> + (void *) new_instance);
> + vect_print_slp_graph (MSG_NOTE, vect_location,
> + SLP_INSTANCE_TREE (new_instance));
> + }
> +
> + return true;
> +}
> +
> +/* Analyze reductions in LOOP_VINFO and populate SLP instances
> + accordingly. Returns false if something fails. */
> +
> +static bool
> +vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
> + unsigned max_tree_size, unsigned *limit,
> + scalar_stmts_to_slp_tree_map_t *bst_map,
> + bool force_single_lane)
> +{
> + if (loop_vinfo->reductions.is_empty ())
> + return true;
> +
> + /* Collect reduction statements we can combine into
> + a SLP reduction. */
> + vec<stmt_vec_info> scalar_stmts;
> + scalar_stmts.create (loop_vinfo->reductions.length ());
> + for (auto next_info : loop_vinfo->reductions)
> + {
> + next_info = vect_stmt_to_vectorize (next_info);
> + if ((STMT_VINFO_RELEVANT_P (next_info)
> + || STMT_VINFO_LIVE_P (next_info))
> + /* ??? Make sure we didn't skip a conversion around a
> + reduction path. In that case we'd have to reverse
> + engineer that conversion stmt following the chain using
> + reduc_idx and from the PHI using reduc_def. */
> + && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
> + || (STMT_VINFO_DEF_TYPE (next_info)
> + == vect_double_reduction_def)))
> + {
> + /* Do not discover SLP reductions combining lane-reducing
> + ops, that will fail later. */
> + if (!force_single_lane
> + && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
> + scalar_stmts.quick_push (next_info);
> + /* Do SLP discovery for single-lane reductions. */
> + else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
> + max_tree_size, limit,
> + bst_map,
> + force_single_lane))
> + return false;
> + }
> + }
> +
> + if (scalar_stmts.length () > 1)
> + {
> + /* Try to form a reduction group. */
> + unsigned int group_size = scalar_stmts.length ();
> + bool *matches = XALLOCAVEC (bool, group_size);
> + if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
> bst_map,
> + max_tree_size, limit, matches))
> + return true;
> +
> + /* When analysis as a single SLP reduction group failed try to
> + form sub-groups by collecting matching lanes. Do not recurse
> + that on failure (to limit compile-time costs), but recurse
> + for the initial non-matching parts. Everything not covered
> + by a sub-group gets single-reduction treatment. */
> + vec<stmt_vec_info> cands;
> + cands.create (group_size);
> + while (matches[0])
> + {
> + cands.truncate (0);
> + for (unsigned i = 0; i < group_size; ++i)
> + if (matches[i])
> + cands.quick_push (scalar_stmts[i]);
> +
> + /* Try to form a reduction group. */
> + if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
> + max_tree_size, limit, NULL))
> + {
> + cands = vNULL;
> + cands.create (group_size);
> + }
Could you not move the create under the truncate (0) call above and remove
the one outside the loop?  Then this block just becomes
if (!vect_analyze_slp_reduction_group (...)).
The truncate should be safe on a null/empty vec.
Thanks,
Tamar
> + else
> + {
> + /* Do SLP discovery for single-lane reductions. */
> + for (auto stmt_info : cands)
> + if (! vect_analyze_slp_reduction (loop_vinfo,
> + vect_stmt_to_vectorize
> + (stmt_info),
> + max_tree_size, limit,
> + bst_map, force_single_lane))
> + return false;
> + }
> + unsigned j = 0;
> + for (unsigned i = 0; i < group_size; ++i)
> + if (!matches[i])
> + {
> + scalar_stmts[j] = scalar_stmts[i];
> + ++j;
> + }
> + scalar_stmts.truncate (j);
> + group_size = scalar_stmts.length ();
> + if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
> + bst_map, max_tree_size, limit,
> + matches))
> + return true;
> + }
> + }
> + /* Do SLP discovery for single-lane reductions. */
> + for (auto stmt_info : scalar_stmts)
> + if (! vect_analyze_slp_reduction (loop_vinfo,
> + vect_stmt_to_vectorize (stmt_info),
> + max_tree_size, limit,
> + bst_map, force_single_lane))
> + return false;
> +
> + return true;
> +}
> +
> /* Analyze an SLP instance starting from a group of grouped stores. Call
> vect_build_slp_tree to build a tree of packed stmts if possible.
> Return FALSE if it's impossible to SLP any stmt in the group. */
> @@ -5617,64 +5790,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned
> max_tree_size,
> if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> {
> /* Find SLP sequences starting from groups of reductions. */
> - if (loop_vinfo->reductions.length () > 0)
> - {
> - /* Collect reduction statements we can combine into
> - a SLP reduction. */
> - vec<stmt_vec_info> scalar_stmts;
> - scalar_stmts.create (loop_vinfo->reductions.length ());
> - for (auto next_info : loop_vinfo->reductions)
> - {
> - next_info = vect_stmt_to_vectorize (next_info);
> - if ((STMT_VINFO_RELEVANT_P (next_info)
> - || STMT_VINFO_LIVE_P (next_info))
> - /* ??? Make sure we didn't skip a conversion around a
> - reduction path. In that case we'd have to reverse
> - engineer that conversion stmt following the chain using
> - reduc_idx and from the PHI using reduc_def. */
> - && (STMT_VINFO_DEF_TYPE (next_info) ==
> vect_reduction_def
> - || (STMT_VINFO_DEF_TYPE (next_info)
> - == vect_double_reduction_def)))
> - {
> - /* Do not discover SLP reductions combining lane-reducing
> - ops, that will fail later. */
> - if (!force_single_lane
> - && !lane_reducing_stmt_p (STMT_VINFO_STMT
> (next_info)))
> - scalar_stmts.quick_push (next_info);
> - /* Do SLP discovery for single-lane reductions. */
> - else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
> - max_tree_size,
> &limit,
> - bst_map,
> - force_single_lane))
> - return opt_result::failure_at (vect_location,
> - "SLP build failed.\n");
> - }
> - }
> - /* Save for re-processing on failure. */
> - vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
> - vec<stmt_vec_info> roots = vNULL;
> - vec<tree> remain = vNULL;
> - if (scalar_stmts.length () <= 1
> - || !vect_build_slp_instance (loop_vinfo,
> - slp_inst_kind_reduc_group,
> - scalar_stmts, roots, remain,
> - max_tree_size, &limit, bst_map,
> - force_single_lane))
> - {
> - if (scalar_stmts.length () <= 1)
> - scalar_stmts.release ();
> - /* Do SLP discovery for single-lane reductions. */
> - for (auto stmt_info : saved_stmts)
> - if (! vect_analyze_slp_reduction (loop_vinfo,
> - vect_stmt_to_vectorize
> - (stmt_info),
> - max_tree_size, &limit,
> - bst_map, force_single_lane))
> - return opt_result::failure_at (vect_location,
> - "SLP build failed.\n");
> - }
> - saved_stmts.release ();
> - }
> + if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
> + bst_map, force_single_lane))
> + return opt_result::failure_at (vect_location, "SLP build failed.\n");
>
> /* Make sure to vectorize only-live stmts, usually inductions. */
> for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
> --
> 2.51.0