The following improves the all-or-nothing discovery of reduction
groups to consider sub-groups by trying toplevel "matches" candidates
for this.  For simplicity and to limit compile-time failed sub-group
matches are not decomposed further, only the originally failed part
is tried again to discover more sub-groups.  Any remaining fails
get picked up by the current single-reduction handling.

Such opportunities happen in hot spots of the ARM Adaptive Scalabe
Texture Compression Encoder (ASTC) where I ran into them, I'm not sure
to what extent VLA vector ISAs benefit from SLP reductions though,
the encoder has up to 32 concurrent reductions running.  It probably
most benefits RVV, PowerPC and s390 as there's no HW mapping for its C++
vector implementation there.  Of course the software path should also
benefit on x86.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

Any opinions?

Thanks,
Richard.

        PR tree-optimization/122722
        * tree-vect-slp.cc (vect_analyze_slp_reductions): New
        function, split out from vect_analyze_slp.  Try SLP
        sub-groups.
        (vect_analyze_slp_reduction_group): New helper.

        * gcc.dg/vect/slp-reduc-14.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/slp-reduc-14.c |  15 ++
 gcc/tree-vect-slp.cc                     | 234 +++++++++++++++++------
 2 files changed, 191 insertions(+), 58 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-14.c

diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c 
b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
new file mode 100644
index 00000000000..7966d23716a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+
+void foo (int * __restrict sums, int *a, int *b, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      sums[0] = sums[0] + a[2*i];
+      sums[1] = sums[1] + a[2*i+1];
+      sums[2] = sums[2] + b[2*i];
+      sums[3] = sums[3] + b[2*i+1];
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "SLP discovery of size 2 reduction group" 
2 "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 07e22ea7ccf..3e403e7b1ea 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4723,6 +4723,179 @@ vect_analyze_slp_reduction (loop_vec_info vinfo,
   return false;
 }
 
+/* Analyze a single SLP reduction group.  If successful add a SLP instance
+   for it and return true, otherwise return false and have *MATCHES
+   populated.  */
+
+static bool
+vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
+                                 vec<stmt_vec_info> scalar_stmts,
+                                 scalar_stmts_to_slp_tree_map_t *bst_map,
+                                 unsigned max_tree_size, unsigned *limit,
+                                 bool *matches)
+{
+  /* Try to form a reduction group.  */
+  unsigned int group_size = scalar_stmts.length ();
+  if (!matches)
+    matches = XALLOCAVEC (bool, group_size);
+  poly_uint64 max_nunits = 1;
+  unsigned tree_size = 0;
+  slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
+                                      group_size,
+                                      &max_nunits, matches, limit,
+                                      &tree_size, bst_map);
+  if (!node)
+    return false;
+
+  /* Create a new SLP instance.  */
+  slp_instance new_instance = XNEW (class _slp_instance);
+  SLP_INSTANCE_TREE (new_instance) = node;
+  SLP_INSTANCE_LOADS (new_instance) = vNULL;
+  SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
+  SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
+  SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
+  new_instance->reduc_phis = NULL;
+  new_instance->cost_vec = vNULL;
+  new_instance->subgraph_entries = vNULL;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "SLP size %u vs. limit %u.\n",
+                    tree_size, max_tree_size);
+
+  loop_vinfo->slp_instances.safe_push (new_instance);
+
+  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+     the number of scalar stmts in the root in a few places.
+     Verify that assumption holds.  */
+  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+             .length () == group_size);
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "SLP discovery of size %d reduction group "
+                      "succeeded\n", group_size);
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "Final SLP tree for instance %p:\n",
+                      (void *) new_instance);
+      vect_print_slp_graph (MSG_NOTE, vect_location,
+                           SLP_INSTANCE_TREE (new_instance));
+    }
+
+  return true;
+}
+
+/* Analyze reductions in LOOP_VINFO and populate SLP instances
+   accordingly.  Returns false if something fails.  */
+
+static bool
+vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
+                            unsigned max_tree_size, unsigned *limit,
+                            scalar_stmts_to_slp_tree_map_t *bst_map,
+                            bool force_single_lane)
+{
+  if (loop_vinfo->reductions.is_empty ())
+    return true;
+
+  /* Collect reduction statements we can combine into
+     a SLP reduction.  */
+  vec<stmt_vec_info> scalar_stmts;
+  scalar_stmts.create (loop_vinfo->reductions.length ());
+  for (auto next_info : loop_vinfo->reductions)
+    {
+      next_info = vect_stmt_to_vectorize (next_info);
+      if ((STMT_VINFO_RELEVANT_P (next_info)
+          || STMT_VINFO_LIVE_P (next_info))
+         /* ???  Make sure we didn't skip a conversion around a
+            reduction path.  In that case we'd have to reverse
+            engineer that conversion stmt following the chain using
+            reduc_idx and from the PHI using reduc_def.  */
+         && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
+             || (STMT_VINFO_DEF_TYPE (next_info)
+                 == vect_double_reduction_def)))
+       {
+         /* Do not discover SLP reductions combining lane-reducing
+            ops, that will fail later.  */
+         if (!force_single_lane
+             && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
+           scalar_stmts.quick_push (next_info);
+         /* Do SLP discovery for single-lane reductions.  */
+         else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
+                                                max_tree_size, limit,
+                                                bst_map,
+                                                force_single_lane))
+           return false;
+       }
+    }
+
+  if (scalar_stmts.length () > 1)
+    {
+      /* Try to form a reduction group.  */
+      unsigned int group_size = scalar_stmts.length ();
+      bool *matches = XALLOCAVEC (bool, group_size);
+      if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
+                                           max_tree_size, limit, matches))
+       return true;
+
+      /* When analysis as a single SLP reduction group failed try to
+        form sub-groups by collecting matching lanes.  Do not recurse
+        that on failure (to limit compile-time costs), but recurse
+        for the initial non-matching parts.  Everything not covered
+        by a sub-group gets single-reduction treatment.  */
+      vec<stmt_vec_info> cands;
+      cands.create (group_size);
+      while (matches[0])
+       {
+         cands.truncate (0);
+         for (unsigned i = 0; i < group_size; ++i)
+           if (matches[i])
+             cands.quick_push (scalar_stmts[i]);
+
+         /* Try to form a reduction group.  */
+         if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
+                                               max_tree_size, limit, NULL))
+           {
+             cands = vNULL;
+             cands.create (group_size);
+           }
+         else
+           {
+             /* Do SLP discovery for single-lane reductions.  */
+             for (auto stmt_info : cands)
+               if (! vect_analyze_slp_reduction (loop_vinfo,
+                                                 vect_stmt_to_vectorize
+                                                   (stmt_info),
+                                                 max_tree_size, limit,
+                                                 bst_map, force_single_lane))
+                 return false;
+           }
+         unsigned j = 0;
+         for (unsigned i = 0; i < group_size; ++i)
+           if (!matches[i])
+             {
+               scalar_stmts[j] = scalar_stmts[i];
+               ++j;
+             }
+         scalar_stmts.truncate (j);
+         group_size = scalar_stmts.length ();
+         if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
+                                               bst_map, max_tree_size, limit,
+                                               matches))
+           return true;
+       }
+    }
+  /* Do SLP discovery for single-lane reductions.  */
+  for (auto stmt_info : scalar_stmts)
+    if (! vect_analyze_slp_reduction (loop_vinfo,
+                                     vect_stmt_to_vectorize (stmt_info),
+                                     max_tree_size, limit,
+                                     bst_map, force_single_lane))
+      return false;
+
+  return true;
+}
+
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
    vect_build_slp_tree to build a tree of packed stmts if possible.
    Return FALSE if it's impossible to SLP any stmt in the group.  */
@@ -5617,64 +5790,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size,
   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
     {
       /* Find SLP sequences starting from groups of reductions.  */
-      if (loop_vinfo->reductions.length () > 0)
-       {
-         /* Collect reduction statements we can combine into
-            a SLP reduction.  */
-         vec<stmt_vec_info> scalar_stmts;
-         scalar_stmts.create (loop_vinfo->reductions.length ());
-         for (auto next_info : loop_vinfo->reductions)
-           {
-             next_info = vect_stmt_to_vectorize (next_info);
-             if ((STMT_VINFO_RELEVANT_P (next_info)
-                  || STMT_VINFO_LIVE_P (next_info))
-                 /* ???  Make sure we didn't skip a conversion around a
-                    reduction path.  In that case we'd have to reverse
-                    engineer that conversion stmt following the chain using
-                    reduc_idx and from the PHI using reduc_def.  */
-                 && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
-                     || (STMT_VINFO_DEF_TYPE (next_info)
-                         == vect_double_reduction_def)))
-               {
-                 /* Do not discover SLP reductions combining lane-reducing
-                    ops, that will fail later.  */
-                 if (!force_single_lane
-                     && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
-                   scalar_stmts.quick_push (next_info);
-                 /* Do SLP discovery for single-lane reductions.  */
-                 else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
-                                                        max_tree_size, &limit,
-                                                        bst_map,
-                                                        force_single_lane))
-                   return opt_result::failure_at (vect_location,
-                                                  "SLP build failed.\n");
-               }
-           }
-         /* Save for re-processing on failure.  */
-         vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
-         vec<stmt_vec_info> roots = vNULL;
-         vec<tree> remain = vNULL;
-         if (scalar_stmts.length () <= 1
-             || !vect_build_slp_instance (loop_vinfo,
-                                          slp_inst_kind_reduc_group,
-                                          scalar_stmts, roots, remain,
-                                          max_tree_size, &limit, bst_map,
-                                          force_single_lane))
-           {
-             if (scalar_stmts.length () <= 1)
-               scalar_stmts.release ();
-             /* Do SLP discovery for single-lane reductions.  */
-             for (auto stmt_info : saved_stmts)
-               if (! vect_analyze_slp_reduction (loop_vinfo,
-                                                 vect_stmt_to_vectorize
-                                                   (stmt_info),
-                                                 max_tree_size, &limit,
-                                                 bst_map, force_single_lane))
-                 return opt_result::failure_at (vect_location,
-                                                "SLP build failed.\n");
-           }
-         saved_stmts.release ();
-       }
+      if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
+                                       bst_map, force_single_lane))
+       return opt_result::failure_at (vect_location, "SLP build failed.\n");
 
       /* Make sure to vectorize only-live stmts, usually inductions.  */
       for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
-- 
2.51.0

Reply via email to