[Public]

Hi,

This patch enables BB SLP vectorization with reduction for  "sum of diffs" kind 
of patterns. Expanding the chain into statements with MINUS_EXPR is currently 
blocked, which can be enabled by turning on a flag, once the full support for 
that is implemented.

Bootstrapped and tested on x86.

Thank You,
Raghesh

gcc/ChangeLog:

        * tree-vect-slp.cc : (vect_slp_linearize_chain): Optional parameter 
allow_alt_code added (default true), check added not to follow MINUS_EXPR, when 
false
        (vect_slp_check_reduction_root): New function
        (vect_slp_check_for_roots): Calls to vect_slp_check_reduction_root

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/bb-slp-sum-of-diffs.c: New test.

>From 7a44a2f80e542ec21fb3a94a6f7a5c256f019792 Mon Sep 17 00:00:00 2001
From: Raghesh Aloor [email protected]<mailto:[email protected]>
Date: Wed, 11 Feb 2026 09:29:55 +0530
Subject: [PATCH v2] BB SLP: Enabling reduction root finding for sum-of-diff kind
of patterns

Add an optional parameter allow_alt_code to vect_slp_linearize_chain
(default true).  When false, do not follow into MINUS_EXPR when
building a PLUS reduction chain; treat MINUS results as leaves.

In vect_slp_check_for_roots, call vect_slp_check_reduction_root once
with allow_alt_code false so that "sum of diffs" (d_i = a[i]-b[i],
sum = d0+...+dN) is recognized and vectorized.  Pure PLUS chains
still work; other callers of vect_slp_linearize_chain keep the
default.  Once support for MINUS_EXPR in the chain is added, this
call site can be switched to allow_alt_code true.

New test: gcc.dg/vect/bb-slp-sum-of-diffs.c
---
.../gcc.dg/vect/bb-slp-sum-of-diffs.c         |  25 +++
gcc/tree-vect-slp.cc                          | 149 ++++++++++--------
2 files changed, 109 insertions(+), 65 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c
new file mode 100644
index 00000000000..a5f066a2692
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3 -fdump-tree-slp2" } */
+
+/* Test that SLP recognizes "sum of diffs" (d_i = a[i]-b[i], sum = d0+...+d7)
+   as a reduction chain and vectorizes the basic block.  */
+
+int __attribute__((noipa))
+sum_of_diffs (int *a, int *b)
+{
+  int d0 = a[0] - b[0];
+  int d1 = a[1] - b[1];
+  int d2 = a[2] - b[2];
+  int d3 = a[3] - b[3];
+  int d4 = a[4] - b[4];
+  int d5 = a[5] - b[5];
+  int d6 = a[6] - b[6];
+  int d7 = a[7] - b[7];
+
+  int sum = d0 + d1 + d2 + d3 + d4 + d5 + d6 + d7;
+  return sum;
+}
+
+/* Check that the reduction was SLP-vectorized (REDUC_PLUS appears in the 
tree).  */
+/* { dg-final { scan-tree-dump "\.REDUC_PLUS" "slp2" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c932e8d5cba..82d1110f3be 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1755,8 +1755,9 @@ dt_sort_cmp (const void *op1_, const void *op2_, void *)
    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
    filling CHAIN with the result and using WORKLIST as intermediate storage.
    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
-   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
-   stmts, starting with START.  */
+   or MINUS_EXPR.     *CHAIN_STMTS if not NULL is filled with all computation
+   stmts, starting with START.  When ALLOW_ALT_CODE is false, do not
+   follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf).  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
@@ -1764,7 +1765,8 @@ vect_slp_linearize_chain (vec_info *vinfo,
                                             vec<chain_op_t> &chain,
                                             enum tree_code code, gimple *start,
                                             gimple *&code_stmt, gimple 
*&alt_code_stmt,
-                                            vec<gimple *> *chain_stmts)
+                                           vec<gimple *> *chain_stmts,
+                                           bool allow_alt_code = true)
{
   /* For each lane linearize the addition/subtraction (or other
      uniform associatable operation) expression tree.  */
@@ -1800,7 +1802,8 @@ vect_slp_linearize_chain (vec_info *vinfo,
                    && single_imm_use (op, &use_p, &use_stmt)
                    && is_gimple_assign (def_stmt_info->stmt)
                    && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
-                             || (code == PLUS_EXPR
+                            || (allow_alt_code
+                                && code == PLUS_EXPR
                                   && (gimple_assign_rhs_code 
(def_stmt_info->stmt)
                                             == MINUS_EXPR))))
                  {
@@ -9874,6 +9877,75 @@ vect_slp_is_lane_insert (gimple *use_stmt, tree vec, 
unsigned *this_lane)
   return true;
}

+/* Check if ASSIGN is the end of a valid reduction chain and, if so, register
+   it as a reduction root in BB_VINFO->roots.  Linearize with ALLOW_ALT_CODE
+   (when false, do not follow into MINUS when building a PLUS chain).  Return
+   true if a root was registered, false otherwise.  */
+
+static bool
+vect_slp_check_reduction_root (bb_vec_info bb_vinfo,
+                                                vec<std::pair<tree_code, 
gimple *> > &worklist,
+                                                vec<chain_op_t> &chain,
+                                                enum tree_code code, gimple 
*assign,
+                                                gimple *&code_stmt, gimple 
*&alt_code_stmt,
+                                                vec<gimple *> *chain_stmts,
+                                                bool allow_alt_code)
+{
+  vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
+                                             code_stmt, alt_code_stmt, 
chain_stmts,
+                                             allow_alt_code);
+  if (chain.length () <= 1)
+    return false;
+  chain.sort (dt_sort_cmp, bb_vinfo);
+  /* ???  Now we'd want to strip externals and constants
+     but record those to be handled in the epilogue.  */
+  /* ???  For now do not allow mixing ops or externs/constants.  */
+  bool invalid = false;
+  unsigned remain_cnt = 0;
+  unsigned last_idx = 0;
+  for (unsigned i = 0; i < chain.length (); ++i)
+    {
+      if (chain[i].code != code)
+            {
+              invalid = true;
+              break;
+            }
+      if (chain[i].dt != vect_internal_def
+              || (gimple_get_lhs (bb_vinfo->lookup_def (chain[i].op)->stmt)
+                  != chain[i].op))
+            remain_cnt++;
+      else
+            last_idx = i;
+    }
+  if ((chain.length () - remain_cnt) & 1)
+    remain_cnt++;
+  if (invalid || chain.length () - remain_cnt <= 1)
+    return false;
+  vec<stmt_vec_info> stmts;
+  vec<tree> remain = vNULL;
+  stmts.create (chain.length ());
+  if (remain_cnt > 0)
+    remain.create (remain_cnt);
+  for (unsigned i = 0; i < chain.length (); ++i)
+    {
+      stmt_vec_info stmt_info;
+      if (chain[i].dt == vect_internal_def
+              && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
+                  gimple_get_lhs (stmt_info->stmt) == chain[i].op)
+              && (i != last_idx || (stmts.length () & 1)))
+            stmts.quick_push (stmt_info);
+      else
+            remain.quick_push (chain[i].op);
+    }
+  vec<stmt_vec_info> roots;
+  roots.create (chain_stmts->length ());
+  for (unsigned i = 0; i < chain_stmts->length (); ++i)
+    roots.quick_push (bb_vinfo->lookup_stmt ((*chain_stmts)[i]));
+  bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
+                                                              stmts, roots, 
remain));
+  return true;
+}
+
/* Find any vectorizable constructors and add them to the grouped_store
    array.  */

@@ -10046,67 +10118,14 @@ vect_slp_check_for_roots (bb_vec_info bb_vinfo)
                if (!reduction_fn_for_scalar_code (code, &reduc_fn)
                    || reduc_fn == IFN_LAST)
                  continue;
-               vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, 
assign,
-                                                            /* ??? */
-                                                            code_stmt, 
alt_code_stmt, &chain_stmts);
-               if (chain.length () > 1)
-                 {
-                   /* Sort the chain according to def_type and operation.  */
-                   chain.sort (dt_sort_cmp, bb_vinfo);
-                   /* ???  Now we'd want to strip externals and constants
-                           but record those to be handled in the epilogue.  */
-                   /* ???  For now do not allow mixing ops or 
externs/constants.  */
-                   bool invalid = false;
-                   unsigned remain_cnt = 0;
-                   unsigned last_idx = 0;
-                   for (unsigned i = 0; i < chain.length (); ++i)
-                           {
-                             if (chain[i].code != code)
-                               {
-                                 invalid = true;
-                                 break;
-                               }
-                             if (chain[i].dt != vect_internal_def
-                                 /* Avoid stmts where the def is not the LHS, 
like
-                                          ASMs.  */
-                                 || (gimple_get_lhs (bb_vinfo->lookup_def
-                                                                               
            (chain[i].op)->stmt)
-                                            != chain[i].op))
-                               remain_cnt++;
-                             else
-                               last_idx = i;
-                           }
-                   /* Make sure to have an even number of lanes as we later do
-                           all-or-nothing discovery, not trying to split 
further.  */
-                   if ((chain.length () - remain_cnt) & 1)
-                           remain_cnt++;
-                   if (!invalid && chain.length () - remain_cnt > 1)
-                           {
-                             vec<stmt_vec_info> stmts;
-                             vec<tree> remain = vNULL;
-                             stmts.create (chain.length ());
-                             if (remain_cnt > 0)
-                               remain.create (remain_cnt);
-                             for (unsigned i = 0; i < chain.length (); ++i)
-                               {
-                                 stmt_vec_info stmt_info;
-                                 if (chain[i].dt == vect_internal_def
-                                            && ((stmt_info = 
bb_vinfo->lookup_def (chain[i].op)),
-                                                gimple_get_lhs 
(stmt_info->stmt) == chain[i].op)
-                                            && (i != last_idx
-                                                || (stmts.length () & 1)))
-                                          stmts.quick_push (stmt_info);
-                                 else
-                                          remain.quick_push (chain[i].op);
-                               }
-                             vec<stmt_vec_info> roots;
-                             roots.create (chain_stmts.length ());
-                             for (unsigned i = 0; i < chain_stmts.length (); 
++i)
-                               roots.quick_push (bb_vinfo->lookup_stmt 
(chain_stmts[i]));
-                             bb_vinfo->roots.safe_push (slp_root 
(slp_inst_kind_bb_reduc,
-                                                                               
             stmts, roots, remain));
-                           }
-                 }
+              /* Build chain with allow_alt_code false: do not follow 
MINUS_EXPR
+                 when building a PLUS chain (treat MINUS as leaf).  This 
recognizes
+                 sum-of-diffs (e.g. d_i = a_i - b_i, sum = d0 + d1 + ...).  
Once
+                 support for mixing MINUS_EXPR in the chain is added, this can 
be
+                 called with allow_alt_code true.  */
+              vect_slp_check_reduction_root (bb_vinfo, worklist, chain, code,
+                                                                     assign, 
code_stmt, alt_code_stmt,
+                                                                     
&chain_stmts, false);
              }
     }
}
--
2.34.1

Reply via email to