On Fri, 10 Oct 2025, Zhongyao Chen wrote:

> Enable SLP vectorization for signed integer reductions by teaching the
> vectorizer to walk PLUS_EXPR trees directly, complementing reassociation
> which skips signed types due to undefined overflow semantics.

Thanks for working on this.  Note this is not the direction I want
the code to do - instead vect_is_simple_reduction should no longer
decide on what is a reduction chain and what not but SLP discovery
should (I started some refactoring there to work towards this a 
few weeks ago but got distracted).  On the SLP side there's the
vect_slp_linearize_chain toolbox that could be extended for this.

That said, what I did not get to yet is to move reduction chain
discovery to vect_analyze_slp () time, thus get rid of
loop_vinfo->reduction_chains and only work from the reductions.

What seems to be missing with your attempt is that if we re-associate
signed operations, we have to perform the operations in an unsigned
type.

Richard.

>       PR tree-optimization/120687
> 
> gcc/ChangeLog:
> 
>       * tree-vect-loop.cc (MAX_SLP_REDUCTION_TREE_DEPTH): New constant.
>       (vect_collect_slp_reduction_ops, vect_merge_slp_reduction_ops,
>       vect_try_extend_slp_reduction_chain): New functions.
>       (vect_is_simple_reduction): Extend PLUS_EXPR reduction chains.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.dg/vect/pr120687-4.c: New test.
> 
> Signed-off-by: Zhongyao Chen <[email protected]>
> ---
>  gcc/testsuite/gcc.dg/vect/pr120687-4.c |  16 ++
>  gcc/tree-vect-loop.cc                  | 253 +++++++++++++++++++++++++
>  2 files changed, 269 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/pr120687-4.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-4.c 
> b/gcc/testsuite/gcc.dg/vect/pr120687-4.c
> new file mode 100644
> index 00000000000..2ac40ee5c4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr120687-4.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +
> +typedef int TYPE;
> +TYPE frd (TYPE *p, TYPE *lastone)
> +{
> +  TYPE sum = 0;
> +  for (; p <= lastone; p += 16)
> +    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
> +           + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" 
> } } */
> +/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 73398e58fdc..e998544380b 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3709,6 +3709,254 @@ check_reduction_path (dump_user_location_t loc, 
> loop_p loop, gphi *phi,
>  }
>  
>  
> +/* Cap the tree walk depth to avoid stack overflows.  */
> +static const unsigned MAX_SLP_REDUCTION_TREE_DEPTH = 128;
> +
> +/* Function vect_collect_slp_reduction_ops
> +   Walk the PLUS tree rooted at OPERAND and push defs in post-order.
> +
> +   Example:
> +        sum'
> +        /  \
> +     tmp0  tmp1
> +     / \   / \
> +     a0  a1 a2  a3
> +
> +   Push result: [tmp0, tmp1, sum']
> +
> +   Keep only PLUS_EXPR inside LOOP with a single use and depth bounded by
> +   MAX_SLP_REDUCTION_TREE_DEPTH.  Value-preserving conversions are allowed
> +   when overflow semantics match.  Any violation aborts with false.  */
> +
> +static bool
> +vect_collect_slp_reduction_ops (loop_vec_info loop_info, class loop *loop,
> +                                 enum tree_code op_code, tree operand,
> +                         vec<stmt_vec_info, va_heap> &stmt_ops,
> +                         bitmap visited_ssa, unsigned depth)
> +{
> +  if (depth > MAX_SLP_REDUCTION_TREE_DEPTH)
> +    {
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: depth %u over limit for %T\n",
> +                    depth, operand);
> +      return false;
> +    }
> +
> +  if (TREE_CODE (operand) != SSA_NAME)
> +    {
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: reached leaf %T\n", operand);
> +      return true;
> +    }
> +
> +  unsigned version = SSA_NAME_VERSION (operand);
> +  if (!bitmap_set_bit (visited_ssa, version))
> +    {
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: already visited %T\n", operand);
> +      return true;
> +    }
> +
> +  stmt_vec_info stmt_info = loop_info->lookup_def (operand);
> +  if (!stmt_info)
> +    {
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: no stmt info for %T\n", operand);
> +      return true;
> +    }
> +
> +  stmt_info = vect_stmt_to_vectorize (stmt_info);
> +  gimple *stmt = stmt_info->stmt;
> +
> +  if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
> +    {
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: %G defined outside loop\n", stmt);
> +      return true;
> +    }
> +
> +  if (!is_gimple_assign (stmt))
> +    {
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: skip non-assign stmt %G\n", stmt);
> +      return true;
> +    }
> +
> +  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
> +
> +  if (rhs_code == op_code)
> +    {
> +      tree lhs = gimple_assign_lhs (stmt);
> +      if (!has_single_use (lhs))
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                            "slp collect: multiple uses of %T\n", lhs);
> +       return false;
> +     }
> +
> +      tree op0 = gimple_assign_rhs1 (stmt);
> +      tree op1 = gimple_assign_rhs2 (stmt);
> +
> +      if (!vect_collect_slp_reduction_ops (loop_info, loop, op_code, op0,
> +                                   stmt_ops, visited_ssa, depth + 1))
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                     "slp collect: left operand %T unsuitable\n", op0);
> +       return false;
> +     }
> +      if (!vect_collect_slp_reduction_ops (loop_info, loop, op_code, op1,
> +                                   stmt_ops, visited_ssa, depth + 1))
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                     "slp collect: right operand %T unsuitable\n", op1);
> +       return false;
> +     }
> +
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: push %G\n", stmt);
> +      stmt_ops.safe_push (stmt_info);
> +      return true;
> +    }
> +
> +  if (CONVERT_EXPR_CODE_P (rhs_code)
> +      && tree_nop_conversion_p (TREE_TYPE (gimple_assign_rhs1 (stmt)),
> +                                    TREE_TYPE (gimple_assign_lhs (stmt))))
> +    {
> +      /* Stop if overflow semantics differ.  */
> +      if (TYPE_OVERFLOW_WRAPS (TREE_TYPE (gimple_assign_rhs1 (stmt)))
> +       != TYPE_OVERFLOW_WRAPS (TREE_TYPE (gimple_assign_lhs (stmt))))
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                            "slp collect: stop at %G (overflow mismatch)\n",
> +                            stmt);
> +       return true;
> +     }
> +      return vect_collect_slp_reduction_ops (loop_info, loop, op_code,
> +                                          gimple_assign_rhs1 (stmt),
> +                                          stmt_ops, visited_ssa, depth + 1);
> +    }
> +
> +  if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                    "slp collect: treat %G as terminal\n", stmt);
> +  return true;
> +}
> +
> +/* Merge EXTRA_OPS into REDUC_CHAIN, dropping duplicates.
> +
> +   Example:
> +   reduc_chain: [t2, root]
> +   extra_ops:   [t0, t1, root]
> +   After merge: [t0, t1, t2, root]
> +
> +   Chains stay short, so the scan keeps the logic simple.  */
> +
> +static void
> +vect_merge_slp_reduction_ops (vec<stmt_vec_info> &reduc_chain,
> +                             vec<stmt_vec_info, va_heap> &extra_ops)
> +{
> +  if (extra_ops.is_empty ())
> +    return;
> +
> +  auto_vec<stmt_vec_info, 16> new_chain;
> +
> +  for (unsigned j = 0; j < extra_ops.length (); ++j)
> +    {
> +      stmt_vec_info info = extra_ops[j];
> +      bool found = false;
> +      for (unsigned k = 0; k < reduc_chain.length (); ++k)
> +     if (reduc_chain[k] == info)
> +       {
> +         found = true;
> +         break;
> +       }
> +      if (!found)
> +     new_chain.safe_push (info);
> +    }
> +
> +  if (new_chain.is_empty ())
> +    return;
> +
> +  for (unsigned j = 0; j < reduc_chain.length (); ++j)
> +    new_chain.safe_push (reduc_chain[j]);
> +
> +  reduc_chain.truncate (0);
> +  for (unsigned j = 0; j < new_chain.length (); ++j)
> +    reduc_chain.safe_push (new_chain[j]);
> +}
> +
> +/* Function vect_try_extend_slp_reduction_chain
> +
> +   Reassociation leaves signed reductions as a lone root, so we walk the 
> other
> +   operand (a PLUS tree) and prepend its statements.  Example:
> +
> +     root: sum' = sum + partial
> +     partial
> +     |-- tmp0 = a0 + a1
> +     `-- tmp1 = a2 + a3
> +     REDUC_CHAIN before: [root]
> +     REDUC_CHAIN after:  [tmp0, tmp1, root]
> +
> +   Accept only PLUS_EXPR with a single use, matching overflow semantics and
> +   bounded depth; fold-left lowering is handled later by
> +   vect_transform_reduction ().  */
> +
> +static bool
> +vect_try_extend_slp_reduction_chain (loop_vec_info loop_info, class loop 
> *loop,
> +                                     enum tree_code op_code,
> +                                     vec<stmt_vec_info> &reduc_chain)
> +{
> +  if (reduc_chain.length () != 1)
> +    return false;
> +
> +  if (op_code != PLUS_EXPR)
> +    return false;
> +
> +  stmt_vec_info root = vect_stmt_to_vectorize (reduc_chain.last ());
> +  gimple *root_stmt = root->stmt;
> +  if (!is_gimple_assign (root_stmt))
> +    return false;
> +
> +  int reduc_idx = STMT_VINFO_REDUC_IDX (root);
> +  if (reduc_idx < 0)
> +    return false;
> +
> +  gcc_assert (gimple_num_ops (root_stmt) >= 3);
> +
> +  tree candidate = (reduc_idx == 0
> +                     ? gimple_assign_rhs2 (root_stmt)
> +                     : gimple_assign_rhs1 (root_stmt));
> +  gcc_assert (candidate != NULL_TREE);
> +
> +  auto_vec<stmt_vec_info, 16> extra_ops;
> +  auto_bitmap visited_ssa;
> +  if (!vect_collect_slp_reduction_ops (loop_info, loop, op_code, candidate,
> +                       extra_ops, visited_ssa, 0))
> +    return false;
> +
> +  if (extra_ops.is_empty ())
> +    return false;
> +
> +  vect_merge_slp_reduction_ops (reduc_chain, extra_ops);
> +
> +  if (dump_enabled_p ())
> +    dump_printf_loc (MSG_NOTE, vect_location,
> +                  "extended SLP reduc_chain to %u stmts\n",
> +                  reduc_chain.length ());
> +
> +  return true;
> +}
>  
>  /* Function vect_is_simple_reduction
>  
> @@ -3953,6 +4201,11 @@ vect_is_simple_reduction (loop_vec_info loop_info, 
> stmt_vec_info phi_info,
>           continue;
>         reduc_chain.safe_push (stmt_info);
>       }
> +      if (slp && is_slp_reduc && code.is_tree_code ())
> +     /* Try to extend a singleton chain via a PLUS tree walk.  */
> +     vect_try_extend_slp_reduction_chain (loop_info, loop,
> +                                            tree_code (code),
> +                                            reduc_chain);
>        if (slp && is_slp_reduc && reduc_chain.length () > 1)
>       {
>         for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to