Hi,

This patch enables BB SLP vectorization with reduction for "sum of diffs" patterns. Following the chain into MINUS_EXPR statements is currently disabled at the new call site; it can be enabled through the new allow_alt_code parameter once full support for mixed chains is implemented.
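
For reference, the shape of the pattern this enables is (a shortened form of the new test):

  int
  sum_of_diffs (int *a, int *b)
  {
    int d0 = a[0] - b[0];
    int d1 = a[1] - b[1];
    int d2 = a[2] - b[2];
    int d3 = a[3] - b[3];
    return d0 + d1 + d2 + d3;
  }

Since the linearizer no longer follows into the MINUS_EXPR defs at this call site, the d_i stay leaves and the summation is recognized as a plain PLUS reduction root.
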
Bootstrapped and tested on x86.
Thank You,
Raghesh
gcc/ChangeLog:

	* tree-vect-slp.cc (vect_slp_linearize_chain): Add optional
	parameter ALLOW_ALT_CODE, defaulting to true.  When it is false,
	do not follow into MINUS_EXPR.
	(vect_slp_check_reduction_root): New function.
	(vect_slp_check_for_roots): Call vect_slp_check_reduction_root.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/bb-slp-sum-of-diffs.c: New test.
From 7a44a2f80e542ec21fb3a94a6f7a5c256f019792 Mon Sep 17 00:00:00 2001
From: Raghesh Aloor [email protected]
Date: Wed, 11 Feb 2026 09:29:55 +0530
Subject: [PATCH v2] BB SLP: Enable reduction root finding for sum-of-diffs patterns
Add an optional parameter allow_alt_code to vect_slp_linearize_chain
(default true). When false, do not follow into MINUS_EXPR when
building a PLUS reduction chain; treat MINUS results as leaves.
In vect_slp_check_for_roots, call vect_slp_check_reduction_root once
with allow_alt_code false so that "sum of diffs" (d_i = a[i]-b[i],
sum = d0+...+dN) is recognized and vectorized. Pure PLUS chains
still work; other callers of vect_slp_linearize_chain keep the
default. Once support for MINUS_EXPR in the chain is added, this
call site can be switched to allow_alt_code true.
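
As a concrete example of the difference (two lanes only, names chosen purely
for illustration):

  int
  two_lane (int *a, int *b)
  {
    int d0 = a[0] - b[0];  /* single-use MINUS_EXPR defs */
    int d1 = a[1] - b[1];
    /* allow_alt_code true : the linearizer follows into the MINUS_EXPR defs
       and builds a mixed chain, roughly { +a[0], -b[0], +a[1], -b[1] },
       which the root check still rejects (no mixing of ops yet).
       allow_alt_code false: d0 and d1 stay leaves, the chain is { +d0, +d1 },
       a pure PLUS chain that can be registered as a reduction root.  */
    return d0 + d1;
  }
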
New test: gcc.dg/vect/bb-slp-sum-of-diffs.c
---
.../gcc.dg/vect/bb-slp-sum-of-diffs.c | 25 +++
gcc/tree-vect-slp.cc | 149 ++++++++++--------
2 files changed, 109 insertions(+), 65 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c b/gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c
new file mode 100644
index 00000000000..a5f066a2692
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-sum-of-diffs.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3 -fdump-tree-slp2" } */
+
+/* Test that SLP recognizes "sum of diffs" (d_i = a[i]-b[i], sum = d0+...+d7)
+ as a reduction chain and vectorizes the basic block. */
+
+int __attribute__((noipa))
+sum_of_diffs (int *a, int *b)
+{
+ int d0 = a[0] - b[0];
+ int d1 = a[1] - b[1];
+ int d2 = a[2] - b[2];
+ int d3 = a[3] - b[3];
+ int d4 = a[4] - b[4];
+ int d5 = a[5] - b[5];
+ int d6 = a[6] - b[6];
+ int d7 = a[7] - b[7];
+
+ int sum = d0 + d1 + d2 + d3 + d4 + d5 + d6 + d7;
+ return sum;
+}
+
+/* Check that the reduction was SLP-vectorized (REDUC_PLUS appears in the tree). */
+/* { dg-final { scan-tree-dump "\.REDUC_PLUS" "slp2" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c932e8d5cba..82d1110f3be 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1755,8 +1755,9 @@ dt_sort_cmp (const void *op1_, const void *op2_, void *)
associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
filling CHAIN with the result and using WORKLIST as intermediate storage.
CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
- or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
- stmts, starting with START. */
+ or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
+ stmts, starting with START. When ALLOW_ALT_CODE is false, do not
+ follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf). */
static void
vect_slp_linearize_chain (vec_info *vinfo,
@@ -1764,7 +1765,8 @@ vect_slp_linearize_chain (vec_info *vinfo,
vec<chain_op_t> &chain,
enum tree_code code, gimple *start,
gimple *&code_stmt, gimple *&alt_code_stmt,
- vec<gimple *> *chain_stmts)
+ vec<gimple *> *chain_stmts,
+ bool allow_alt_code = true)
{
/* For each lane linearize the addition/subtraction (or other
uniform associatable operation) expression tree. */
@@ -1800,7 +1802,8 @@ vect_slp_linearize_chain (vec_info *vinfo,
&& single_imm_use (op, &use_p, &use_stmt)
&& is_gimple_assign (def_stmt_info->stmt)
&& (gimple_assign_rhs_code (def_stmt_info->stmt) == code
- || (code == PLUS_EXPR
+ || (allow_alt_code
+ && code == PLUS_EXPR
&& (gimple_assign_rhs_code (def_stmt_info->stmt)
== MINUS_EXPR))))
{
@@ -9874,6 +9877,75 @@ vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
return true;
}
+/* Check if ASSIGN is the end of a valid reduction chain and, if so, register
+ it as a reduction root in BB_VINFO->roots. Linearize with ALLOW_ALT_CODE
+ (when false, do not follow into MINUS when building a PLUS chain). Return
+ true if a root was registered, false otherwise. */
+
+static bool
+vect_slp_check_reduction_root (bb_vec_info bb_vinfo,
+ vec<std::pair<tree_code, gimple *> > &worklist,
+ vec<chain_op_t> &chain,
+ enum tree_code code, gimple *assign,
+ gimple *&code_stmt, gimple *&alt_code_stmt,
+ vec<gimple *> *chain_stmts,
+ bool allow_alt_code)
+{
+ vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
+ code_stmt, alt_code_stmt, chain_stmts,
+ allow_alt_code);
+ if (chain.length () <= 1)
+ return false;
+ chain.sort (dt_sort_cmp, bb_vinfo);
+ /* ??? Now we'd want to strip externals and constants
+ but record those to be handled in the epilogue. */
+ /* ??? For now do not allow mixing ops or externs/constants. */
+ bool invalid = false;
+ unsigned remain_cnt = 0;
+ unsigned last_idx = 0;
+ for (unsigned i = 0; i < chain.length (); ++i)
+ {
+ if (chain[i].code != code)
+ {
+ invalid = true;
+ break;
+ }
+ if (chain[i].dt != vect_internal_def
+ || (gimple_get_lhs (bb_vinfo->lookup_def (chain[i].op)->stmt)
+ != chain[i].op))
+ remain_cnt++;
+ else
+ last_idx = i;
+ }
+ if ((chain.length () - remain_cnt) & 1)
+ remain_cnt++;
+ if (invalid || chain.length () - remain_cnt <= 1)
+ return false;
+ vec<stmt_vec_info> stmts;
+ vec<tree> remain = vNULL;
+ stmts.create (chain.length ());
+ if (remain_cnt > 0)
+ remain.create (remain_cnt);
+ for (unsigned i = 0; i < chain.length (); ++i)
+ {
+ stmt_vec_info stmt_info;
+ if (chain[i].dt == vect_internal_def
+ && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
+ gimple_get_lhs (stmt_info->stmt) == chain[i].op)
+ && (i != last_idx || (stmts.length () & 1)))
+ stmts.quick_push (stmt_info);
+ else
+ remain.quick_push (chain[i].op);
+ }
+ vec<stmt_vec_info> roots;
+ roots.create (chain_stmts->length ());
+ for (unsigned i = 0; i < chain_stmts->length (); ++i)
+ roots.quick_push (bb_vinfo->lookup_stmt ((*chain_stmts)[i]));
+ bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
+ stmts, roots, remain));
+ return true;
+}
+
/* Find any vectorizable constructors and add them to the grouped_store
array. */
@@ -10046,67 +10118,14 @@ vect_slp_check_for_roots (bb_vec_info bb_vinfo)
if (!reduction_fn_for_scalar_code (code, &reduc_fn)
|| reduc_fn == IFN_LAST)
continue;
- vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
- /* ??? */
- code_stmt, alt_code_stmt, &chain_stmts);
- if (chain.length () > 1)
- {
- /* Sort the chain according to def_type and operation. */
- chain.sort (dt_sort_cmp, bb_vinfo);
- /* ??? Now we'd want to strip externals and constants
- but record those to be handled in the epilogue. */
- /* ??? For now do not allow mixing ops or externs/constants. */
- bool invalid = false;
- unsigned remain_cnt = 0;
- unsigned last_idx = 0;
- for (unsigned i = 0; i < chain.length (); ++i)
- {
- if (chain[i].code != code)
- {
- invalid = true;
- break;
- }
- if (chain[i].dt != vect_internal_def
- /* Avoid stmts where the def is not the LHS, like
- ASMs. */
- || (gimple_get_lhs (bb_vinfo->lookup_def
- (chain[i].op)->stmt)
- != chain[i].op))
- remain_cnt++;
- else
- last_idx = i;
- }
- /* Make sure to have an even number of lanes as we later do
- all-or-nothing discovery, not trying to split further. */
- if ((chain.length () - remain_cnt) & 1)
- remain_cnt++;
- if (!invalid && chain.length () - remain_cnt > 1)
- {
- vec<stmt_vec_info> stmts;
- vec<tree> remain = vNULL;
- stmts.create (chain.length ());
- if (remain_cnt > 0)
- remain.create (remain_cnt);
- for (unsigned i = 0; i < chain.length (); ++i)
- {
- stmt_vec_info stmt_info;
- if (chain[i].dt == vect_internal_def
- && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
- gimple_get_lhs (stmt_info->stmt) == chain[i].op)
- && (i != last_idx
- || (stmts.length () & 1)))
- stmts.quick_push (stmt_info);
- else
- remain.quick_push (chain[i].op);
- }
- vec<stmt_vec_info> roots;
- roots.create (chain_stmts.length ());
- for (unsigned i = 0; i < chain_stmts.length (); ++i)
- roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
- bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
- stmts, roots, remain));
- }
- }
+ /* Build chain with allow_alt_code false: do not follow MINUS_EXPR
+ when building a PLUS chain (treat MINUS as leaf). This recognizes
+ sum-of-diffs (e.g. d_i = a_i - b_i, sum = d0 + d1 + ...). Once
+ support for mixing MINUS_EXPR in the chain is added, this can be
+ called with allow_alt_code true. */
+ vect_slp_check_reduction_root (bb_vinfo, worklist, chain, code,
+ assign, code_stmt, alt_code_stmt,
+ &chain_stmts, false);
}
}
}
--
2.34.1