The following makes sure that live scalar stmts appearing in multiple
SLP nodes are costed only once and are code-generated from the SLP
node for which we verified that all scalar uses can be replaced.

        * tree-vectorizer.h (_slp_tree::live_lanes): New vector.
        (SLP_TREE_LIVE_LANES): New.
        * tree-vect-loop.cc (vectorizable_live_operation): Append
        to SLP_TREE_LIVE_LANES.
        * tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize
        SLP_TREE_LIVE_LANES.
        (_slp_tree::~_slp_tree): Release SLP_TREE_LIVE_LANES.
        (vect_print_slp_tree): Adjust live lane dumping, indicating
        the SLP node a lane is code generated from.
        (vect_bb_slp_mark_live_stmts): No longer verify that we can
        code-generate from all SLP nodes, only from at least one,
        picking the first.
        * tree-vect-stmts.cc (vect_transform_stmt): Iterate over
        SLP_TREE_LIVE_LANES.
        (vect_analyze_stmt): Also analyze reductions for live
        lanes.
---
 gcc/tree-vect-loop.cc  | 11 ++++++++---
 gcc/tree-vect-slp.cc   | 31 +++++++++++++++----------------
 gcc/tree-vect-stmts.cc | 12 ++++++++----
 gcc/tree-vectorizer.h  |  5 +++++
 4 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5518be4d392..dd239a0b015 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10165,7 +10165,10 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   if (vect_is_reduction (slp_node))
     {
       if (!vec_stmt_p)
-       return true;
+       {
+         SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
+         return true;
+       }
       /* For SLP reductions we vectorize the epilogue for all involved stmts
         together.  For SLP reduction chains we only get here once.  */
       if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
@@ -10289,6 +10292,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       if (!loop_vinfo)
        record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
                          0, vect_epilogue);
+      SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
       return true;
     }
 
@@ -10423,7 +10427,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
            /* ???  This can happen when the live lane ends up being
               rooted in a vector construction code-generated by an
               external SLP node (and code-generation for that already
-              happened).  See gcc.dg/vect/bb-slp-47.c.
+              happened).  See gcc.dg/vect/bb-slp-47.c or
+              gcc.dg/vect/pr97173.c or gcc.dg/vect/bb-slp-pr115777.c.
               Doing this is what would happen if that vector CTOR
               were not code-generated yet so it is not too bad.
               ???  In fact we'd likely want to avoid this situation
@@ -10444,7 +10449,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
            /* ???  It can also happen that we end up pulling a def into
               a loop where replacing out-of-loop uses would require
               a new LC SSA PHI node.  Retain the original scalar in
-              those cases as well.  PR98064.  */
+              those cases as well.  PR98064, gcc.dg/vect/bb-slp-57.c.  */
            if (TREE_CODE (new_tree) == SSA_NAME
                && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
                && (gimple_bb (use_stmt)->loop_father
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index e4b3352f958..10e9ff607ad 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -114,6 +114,7 @@ _slp_tree::_slp_tree ()
   slp_first_node = this;
   SLP_TREE_SCALAR_STMTS (this) = vNULL;
   SLP_TREE_SCALAR_OPS (this) = vNULL;
+  SLP_TREE_LIVE_LANES (this) = vNULL;
   SLP_TREE_VEC_DEFS (this) = vNULL;
   SLP_TREE_CHILDREN (this) = vNULL;
   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
@@ -149,6 +150,7 @@ _slp_tree::~_slp_tree ()
   SLP_TREE_CHILDREN (this).release ();
   SLP_TREE_SCALAR_STMTS (this).release ();
   SLP_TREE_SCALAR_OPS (this).release ();
+  SLP_TREE_LIVE_LANES (this).release ();
   SLP_TREE_VEC_DEFS (this).release ();
   SLP_TREE_LOAD_PERMUTATION (this).release ();
   SLP_TREE_LANE_PERMUTATION (this).release ();
@@ -3340,7 +3342,9 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
       if (stmt_info)
        dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
-                        STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
+                        SLP_TREE_LIVE_LANES (node).contains (i)
+                        ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
+                                    ? "[l] " : ""),
                         i, stmt_info->stmt);
       else
        dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
@@ -9023,28 +9027,23 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
              }
          if (live_p && can_insert)
            {
+             /* Only record a live stmt when we can replace all uses.  We
+                record from which SLP tree we vectorize the uses, so we'll
+                cost once and can deal with the case that not all SLP nodes
+                may be suitable for code-generation of all live uses.
+                ???  But we never split up the work between multiple SLP
+                nodes.  */
              STMT_VINFO_LIVE_P (stmt_info) = true;
-             if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
-                                              instance, i, false, cost_vec))
+             if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
+                                               instance, i, false, cost_vec))
                {
-                 /* ???  So we know we can vectorize the live stmt from one SLP
-                    node.  If we cannot do so from all or none consistently
-                    we'd have to record which SLP node (and lane) we want to
-                    use for the live operation.  So make sure we can
-                    code-generate from all nodes.  */
-                 /* ???  We are costing the extract possibly multiple times,
-                    but code-generation also works this way, leaving uses
-                    that are not valid for one extraction to be handled
-                    by another.  */
+                 STMT_VINFO_LIVE_P (stmt_info) = false;
                  mark_visited = false;
                }
            }
        }
       if (mark_visited)
-       {
-         STMT_VINFO_LIVE_P (stmt_info) = false;
-         svisited.add (stmt_info);
-       }
+       svisited.add (stmt_info);
     }
 
   slp_tree child;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2c3214cc196..c895e143473 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13316,7 +13316,6 @@ vect_analyze_stmt (vec_info *vinfo,
   /* Stmts that are (also) "live" (i.e. - that are used out of the loop)
       need extra handling, except for vectorizable reductions.  */
   if (!bb_vinfo
-      && SLP_TREE_TYPE (node) != reduc_vec_info_type
       && (SLP_TREE_TYPE (node) != lc_phi_info_type
          || SLP_TREE_DEF_TYPE (node) == vect_internal_def)
       && (!node->ldst_lanes || SLP_TREE_PERMUTE_P (node))
@@ -13466,9 +13465,14 @@ vect_transform_stmt (vec_info *vinfo,
     {
       /* Handle stmts whose DEF is used outside the loop-nest that is
         being vectorized.  */
-      done = can_vectorize_live_stmts (vinfo, slp_node,
-                                      slp_node_instance, true, NULL);
-      gcc_assert (done);
+      for (unsigned lane : SLP_TREE_LIVE_LANES (slp_node))
+       {
+         stmt_vec_info slp_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[lane];
+         done = vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
+                                             slp_node_instance, lane,
+                                             true, NULL);
+         gcc_assert (done);
+       }
     }
 
   return is_store;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9a4126e0cec..e2aa1c4bad5 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -328,6 +328,10 @@ struct _slp_tree {
   vec<stmt_vec_info> stmts;
   /* A group of scalar operands to be vectorized together.  */
   vec<tree> ops;
+  /* A set of lane indices that are live and to be code-generated from
+     this SLP node.  */
+  vec<unsigned> live_lanes;
+
   /* The representative that should be used for analysis and
      code generation.  */
   stmt_vec_info representative;
@@ -457,6 +461,7 @@ public:
 #define SLP_TREE_CHILDREN(S)                     (S)->children
 #define SLP_TREE_SCALAR_STMTS(S)                 (S)->stmts
 #define SLP_TREE_SCALAR_OPS(S)                   (S)->ops
+#define SLP_TREE_LIVE_LANES(S)                  (S)->live_lanes
 #define SLP_TREE_REF_COUNT(S)                    (S)->refcnt
 #define SLP_TREE_VEC_DEFS(S)                     (S)->vec_defs
 #define SLP_TREE_LOAD_PERMUTATION(S)             (S)->load_permutation
-- 
2.51.0

Reply via email to