* Future work
This patch is an RFC, so the current implementation is conservative. It
can potentially be extended to support more lane-wise operations and
more loop patterns such as reductions.
This patch is bootstrapped and regression-tested on x86_64-linux-gnu,
arm-linux-gnueabihf and aarch64-linux-gnu. Additional test cases will
be added later in a formal patch if this overall direction is agreed.
Comments and suggestions on this would be appreciated.
gcc/ChangeLog:
* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize
skip_rev_perm to false.
(vect_print_slp_tree): Dump skip_rev_perm flag.
(vect_count_slp_node_refs): New function to count SLP node
references across SLP instances.
(vect_can_skip_reverse_perm_p): New function to check whether
reverse permutations in each SLP instance can be skipped.
(vect_mark_skip_reverse_perm): New function to mark the nodes
whose reverse permutations can be skipped.
(vect_optimize_slp): Apply reverse permutation elimination.
* tree-vect-stmts.cc (vect_simple_negative_step_p): New function
to check negative-step contiguous DRs.
(vect_lanewise_code_p): New function to check lane-wise code.
(vectorizable_store): Skip generating reverse permutations when
skip_rev_perm is set.
(vectorizable_load): Likewise.
* tree-vectorizer.h (struct _slp_tree): Add skip_rev_perm field.
(SLP_TREE_SKIP_REV_PERM): New macro.
(vect_simple_negative_step_p): Declare.
(vect_lanewise_code_p): Declare.
---
gcc/tree-vect-slp.cc | 118 +++++++++++++++++++++++++++++++++++++++++
gcc/tree-vect-stmts.cc | 64 +++++++++++++++++++++-
gcc/tree-vectorizer.h | 6 +++
3 files changed, 186 insertions(+), 2 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c932e8d5cba..fd87b83f516 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -119,6 +119,7 @@ _slp_tree::_slp_tree ()
SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
SLP_TREE_LANE_PERMUTATION (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
+ SLP_TREE_SKIP_REV_PERM (this) = false;
SLP_TREE_CODE (this) = ERROR_MARK;
SLP_TREE_GS_SCALE (this) = 0;
SLP_TREE_GS_BASE (this) = NULL_TREE;
@@ -3320,6 +3321,8 @@ vect_print_slp_tree (dump_flags_t dump_kind,
dump_location_t loc,
dump_printf (dump_kind, " }%s\n",
node->ldst_lanes ? " (load-lanes)" : "");
}
+ if (SLP_TREE_SKIP_REV_PERM (node))
+ dump_printf_loc (metadata, user_loc, "\treverse permutation skipped\n");
if (SLP_TREE_CHILDREN (node).is_empty ())
return;
dump_printf_loc (metadata, user_loc, "\tchildren");
@@ -8310,6 +8313,115 @@ vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t
*bst_map, slp_tree& node)
*bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
}
+/* Traverse the SLP tree rooted at NODE and increment the reference count of
+ each node in REFCNTS. When called once per SLP instance, REFCNTS records
+ how many distinct instances reference each node. VISITED ensures nodes are
+ counted at most once per instance. */
+
+static void
+vect_count_slp_node_refs (slp_tree node, hash_set<slp_tree> &visited,
+ hash_map<slp_tree, unsigned> &refcnts)
+{
+ if (!node || visited.add (node))
+ return;
+
+ unsigned *cntp = refcnts.get (node);
+ refcnts.put (node, cntp ? (*cntp + 1) : 1);
+
+ for (slp_tree child : SLP_TREE_CHILDREN (node))
+ vect_count_slp_node_refs (child, visited, refcnts);
+}
+
+/* Return true if the SLP instance rooted at NODE can safely skip reverse
+ permutations for its negative-step contiguous DRs. REFCNTS is used to
+ reject nodes shared across SLP instances. CANDIDATES is populated with
+ corresponding DR nodes. */
+
+static bool
+vect_can_skip_reverse_perm_p (loop_vec_info loop_vinfo, slp_tree node,
+ hash_map<slp_tree, unsigned> &refcnts,
+ auto_vec<slp_tree> &candidates)
+{
+ if (!node)
+ return false;
+
+ /* Constant or external defs are acceptable if they are uniform vectors,
+ since uniform vectors stay unchanged after reversal. */
+ vect_def_type def_type = SLP_TREE_DEF_TYPE (node);
+ if (def_type == vect_constant_def
+ || def_type == vect_external_def)
+ return vect_slp_tree_uniform_p (node);
+
+ /* Require internal defs and reject nodes shared across SLP instances. */
+ if (def_type != vect_internal_def
+ || refcnts.get_or_insert (node, NULL) > 1)
+ return false;
+
+ /* Only handle gimple assign statements for now. */
+ stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
+ if (!stmt_info || !is_gimple_assign (stmt_info->stmt))
+ return false;
+
+ /* Any use outside the loop means the vector value escapes, so skipping the
+ reverse permutation could change the lane order observed by that use. */
+ tree stmt_def = gimple_assign_lhs (stmt_info->stmt);
+ if (TREE_CODE (stmt_def) == SSA_NAME)
+ {
+ imm_use_iterator imm_iter;
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, stmt_def)
+ if (!flow_bb_inside_loop_p (loop_vinfo->loop,
+ gimple_bb (USE_STMT (use_p))))
+ return false;
+ }
+
+ if (STMT_VINFO_DATA_REF (stmt_info))
+ {
+ if (vect_simple_negative_step_p (loop_vinfo, stmt_info))
+ candidates.safe_push (node);
+ else
+ return false;
+ }
+ else if (!vect_lanewise_code_p (gimple_assign_rhs_code (stmt_info->stmt)))
+ return false;
+
+ for (slp_tree child : SLP_TREE_CHILDREN (node))
+ if (!vect_can_skip_reverse_perm_p (loop_vinfo, child, refcnts, candidates))
+ return false;
+
+ return true;
+}
+
+/* Mark SLP nodes whose reverse permutations can be skipped. */
+
+static void
+vect_mark_skip_reverse_perm (loop_vec_info loop_vinfo)
+{
+ /* Count how many instances reference each SLP tree node. */
+ hash_map<slp_tree, unsigned> refcnts;
+ for (auto inst : loop_vinfo->slp_instances)
+ {
+ hash_set<slp_tree> visited;
+ vect_count_slp_node_refs (SLP_INSTANCE_TREE (inst), visited, refcnts);
+ }
+
+ /* Iterate again to find SLP instances where reverse permutations inside can
+ be safely skipped and mark the corresponding nodes. */
+ for (auto inst : loop_vinfo->slp_instances)
+ {
+ slp_tree root = SLP_INSTANCE_TREE (inst);
+ stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root);
+ if (!stmt_info
+ || !STMT_VINFO_DATA_REF (stmt_info))
+ continue;
+
+ auto_vec<slp_tree> candidates;
+ if (vect_can_skip_reverse_perm_p (loop_vinfo, root, refcnts, candidates))
+ for (slp_tree node : candidates)
+ SLP_TREE_SKIP_REV_PERM (node) = true;
+ }
+}
+
/* Optimize the SLP graph of VINFO. */
void
@@ -8327,6 +8439,12 @@ vect_optimize_slp (vec_info *vinfo)
vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
release_scalar_stmts_to_slp_tree_map (bst_map);
+
+ /* Apply reverse permutation elimination. This is restricted to loop SLP as
+ it's required that no vector value escapes from the SLP graph. */
+ loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+ if (loop_vinfo)
+ vect_mark_skip_reverse_perm (loop_vinfo);
}
/* Gather loads reachable from the individual SLP graph entries. */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 22285250aa8..db3a550c301 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1859,6 +1859,64 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info
stmt_info)
size_zero_node);
}
+/* STMT_INFO is a load or store. Return true if it is a contiguous DR with a
+ negative step in an innermost loop. Strided, gather/scatter, grouped, and
+ partial-vector accesses are excluded. */
+
+bool
+vect_simple_negative_step_p (loop_vec_info loop_vinfo, stmt_vec_info stmt_info)
+{
+ if (loop_vinfo->loop->inner
+ || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ return false;
+
+ if (STMT_VINFO_STRIDED_P (stmt_info)
+ || STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ || STMT_VINFO_GROUPED_ACCESS (stmt_info))
+ return false;
+
+ return compare_step_with_zero (loop_vinfo, stmt_info) < 0;
+}
+
+/* Return true for a conservative subset of tree codes that are lane-wise. */
+
+bool
+vect_lanewise_code_p (tree_code code)
+{
+ switch (code)
+ {
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ case MULT_EXPR:
+ case NEGATE_EXPR:
+ case MIN_EXPR:
+ case MAX_EXPR:
+ case ABS_EXPR:
+ case LSHIFT_EXPR:
+ case RSHIFT_EXPR:
+ case LROTATE_EXPR:
+ case RROTATE_EXPR:
+ case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ case BIT_AND_EXPR:
+ case BIT_NOT_EXPR:
+ case LT_EXPR:
+ case LE_EXPR:
+ case GT_EXPR:
+ case GE_EXPR:
+ case EQ_EXPR:
+ case NE_EXPR:
+ case UNLT_EXPR:
+ case UNLE_EXPR:
+ case UNGT_EXPR:
+ case UNGE_EXPR:
+ case UNEQ_EXPR:
+ return true;
+ default:
+ return false;
+ }
+}
+
/* If the target supports a permute mask that reverses the elements in
a vector of type VECTYPE, return that mask, otherwise return null. */
@@ -9323,7 +9381,8 @@ vectorizable_store (vec_info *vinfo,
if (!costing_p)
vec_oprnd = vec_oprnds[i];
- if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ if (memory_access_type == VMAT_CONTIGUOUS_REVERSE
+ && !SLP_TREE_SKIP_REV_PERM (slp_node))
{
if (costing_p)
inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
@@ -11844,7 +11903,8 @@ vectorizable_load (vec_info *vinfo,
}
}
- if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ if (memory_access_type == VMAT_CONTIGUOUS_REVERSE
+ && !SLP_TREE_SKIP_REV_PERM (slp_node))
{
if (costing_p)
inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bde164301a8..d18a296748a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -358,6 +358,9 @@ struct _slp_tree {
poly_uint64 max_nunits;
/* The DEF type of this node. */
enum vect_def_type def_type;
+ /* Whether the reverse permutation associated with a contiguous negative-step
+ DR can be eliminated. */
+ bool skip_rev_perm;
/* The number of scalar lanes produced by this node. */
unsigned int lanes;
/* The operation of this node. */
@@ -462,6 +465,7 @@ public:
#define SLP_TREE_LOAD_PERMUTATION(S) (S)->load_permutation
#define SLP_TREE_LANE_PERMUTATION(S) (S)->lane_permutation
#define SLP_TREE_DEF_TYPE(S) (S)->def_type
+#define SLP_TREE_SKIP_REV_PERM(S) (S)->skip_rev_perm
#define SLP_TREE_VECTYPE(S) (S)->vectype
#define SLP_TREE_REPRESENTATIVE(S) (S)->representative
#define SLP_TREE_LANES(S) (S)->lanes
@@ -2522,6 +2526,8 @@ extern bool supportable_indirect_convert_operation
(code_helper,
tree = NULL_TREE,
slp_tree = NULL);
extern int compare_step_with_zero (vec_info *, stmt_vec_info);
+extern bool vect_simple_negative_step_p (loop_vec_info, stmt_vec_info);
+extern bool vect_lanewise_code_p (tree_code);
extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
enum vect_cost_for_stmt, stmt_vec_info,