The following refactors the vectorizer vector_costs target API to add a new vector_costs::add_vector_cost entry which groups all individual sub-stmts we create per "vector stmt", aka SLP node. This allows targets to more easily match complex cases like emulated gather/scatter or even just vector construction.
The patch itself is just a prototype and leaves out BB vectorization for simplicity. It also does not fully group all vector stmts but leaves some bare add_stmt_cost hook invocations. I'd expect the add_stmt_cost hook to still be used for scalar stmt costing and for costing added branching around prologue/epilogue. The default implementation of add_vector_cost just dispatches to add_stmt_cost for individual stmts. Eventually the actual data we track for the combined costing will diverge (no need to track SLP node or stmt_info there?), so targets would eventually be expected to implement both hooks and splice out common workers to deal with "missing" information coming in from the different entries. This should eventually baby-step us towards the generic vectorizer code being able to compute and compare latency and resource utilization throughout the scalar / vector loop iteration based on latency and throughput data determined on a stmt-by-stmt basis from the target. As given, the grouping should be an incremental improvement, but I have not tried to see how it can simplify the x86 hook implementation - I was prompted by the bootstrap failure reported on aarch64 for the cleanup RFC I posted, given that code wants to identify a scalar load that's costed as part of a gather/scatter operation. Any comments or problems you foresee? Thanks, Richard. * tree-vectorizer.h (vector_costs::add_vector_cost): New method. (_slp_tree::cost_vec): New. * tree-vectorizer.cc (vector_costs::add_vector_cost): Add fallback implementation. * tree-vect-stmts.cc (vect_analyze_stmt): For loop vectorization record costs into the SLP node specific cost vector. * tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize cost_vec. (_slp_tree::~_slp_tree): Release cost_vec. (vect_slp_add_node_cost): New. (vect_slp_analyze_operations): Cost the stmt groups recorded per SLP node for loop vectorization. 
--- gcc/tree-vect-slp.cc | 27 +++++++++++++++++++++++++++ gcc/tree-vect-stmts.cc | 32 +++++++++++++++++--------------- gcc/tree-vectorizer.cc | 12 ++++++++++++ gcc/tree-vectorizer.h | 6 ++++++ 4 files changed, 62 insertions(+), 15 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 8d0a612577b..5c112800087 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -127,6 +127,7 @@ _slp_tree::_slp_tree () SLP_TREE_REPRESENTATIVE (this) = NULL; SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT; SLP_TREE_REF_COUNT (this) = 1; + this->cost_vec = vNULL; this->failed = NULL; this->max_nunits = 1; this->lanes = 0; @@ -149,6 +150,7 @@ _slp_tree::~_slp_tree () SLP_TREE_LOAD_PERMUTATION (this).release (); SLP_TREE_LANE_PERMUTATION (this).release (); SLP_TREE_SIMD_CLONE_INFO (this).release (); + this->cost_vec.release (); if (this->failed) free (failed); } @@ -8499,6 +8501,23 @@ vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots, vect_slp_prune_covered_roots (child, roots, visited); } +/* Cost vectorization of NODE and children recursively. */ + +static void +vect_slp_add_node_cost (vector_costs *vector_costs, slp_tree node, + hash_set<slp_tree> &visited) +{ + if (visited.add (node)) + return; + + for (slp_tree child : SLP_TREE_CHILDREN (node)) + if (child) + vect_slp_add_node_cost (vector_costs, child, visited); + + if (node->cost_vec.exists ()) + vector_costs->add_vector_cost (node, &node->cost_vec); +} + /* Analyze statements in SLP instances of VINFO. Return true if the operations are supported. */ @@ -8582,6 +8601,14 @@ vect_slp_analyze_operations (vec_info *vinfo) } } + if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo)) + { + visited.empty (); + for (auto instance : loop_vinfo->slp_instances) + vect_slp_add_node_cost (loop_vinfo->vector_costs, + SLP_INSTANCE_TREE (instance), visited); + } + /* Now look for SLP instances with a root that are covered by other instances and remove them. 
*/ hash_set<stmt_vec_info> roots; diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 8f38d8bcb7c..19c29402068 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -14006,6 +14006,8 @@ vect_analyze_stmt (vec_info *vinfo, return opt_result::failure_at (stmt_info->stmt, "needs non-SLP handling\n"); + gcc_assert (node->cost_vec.is_empty ()); + ok = true; if (!bb_vinfo && (STMT_VINFO_RELEVANT_P (stmt_info) @@ -14013,34 +14015,34 @@ vect_analyze_stmt (vec_info *vinfo, /* Prefer vectorizable_call over vectorizable_simd_clone_call so -mveclibabi= takes preference over library functions with the simd attribute. */ - ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, cost_vec) + ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec) || vectorizable_simd_clone_call (vinfo, stmt_info, NULL, NULL, node, - cost_vec) + &node->cost_vec) || vectorizable_conversion (vinfo, stmt_info, - NULL, NULL, node, cost_vec) + NULL, NULL, node, &node->cost_vec) || vectorizable_operation (vinfo, stmt_info, - NULL, NULL, node, cost_vec) + NULL, NULL, node, &node->cost_vec) || vectorizable_assignment (vinfo, stmt_info, - NULL, NULL, node, cost_vec) - || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec) - || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec) + NULL, NULL, node, &node->cost_vec) + || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec) + || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec) || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo), - stmt_info, node, cost_vec) + stmt_info, node, &node->cost_vec) || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info, - node, node_instance, cost_vec) + node, node_instance, &node->cost_vec) || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info, - NULL, node, cost_vec) - || vectorizable_shift (vinfo, stmt_info, NULL, NULL, node, cost_vec) + NULL, node, &node->cost_vec) + || 
vectorizable_shift (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec) || vectorizable_condition (vinfo, stmt_info, - NULL, NULL, node, cost_vec) + NULL, NULL, node, &node->cost_vec) || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node, - cost_vec) + &node->cost_vec) || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo), stmt_info, node) || vectorizable_recurr (as_a <loop_vec_info> (vinfo), - stmt_info, NULL, node, cost_vec) + stmt_info, NULL, node, &node->cost_vec) || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node, - cost_vec)); + &node->cost_vec)); else { if (bb_vinfo) diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc index 447f882c518..083ab46728c 100644 --- a/gcc/tree-vectorizer.cc +++ b/gcc/tree-vectorizer.cc @@ -1844,6 +1844,18 @@ vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* See the comment above the declaration for details. */ +unsigned int +vector_costs::add_vector_cost (slp_tree, stmt_vector_for_cost *cost_vec) +{ + unsigned int sum = 0; + for (auto item : *cost_vec) + sum += add_stmt_cost (item.count, item.kind, item.stmt_info, item.node, + item.vectype, item.misalign, item.where); + return sum; +} + +/* See the comment above the declaration for details. */ + void vector_costs::finish_cost (const vector_costs *) { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 990072fca95..26fee63961e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -243,6 +243,8 @@ struct _slp_tree { for linear arguments (pair of NULLs for other arguments). */ vec<tree> simd_clone_info; + stmt_vector_for_cost cost_vec; + tree vectype; /* Vectorized defs. */ vec<tree> vec_defs; @@ -1665,6 +1667,10 @@ public: tree vectype, int misalign, vect_cost_model_location where); + /* Update the costs in response to generating vector code for NODE + with the stmt parts described by COST_VEC. 
*/ + virtual unsigned int add_vector_cost (slp_tree node, stmt_vector_for_cost *); + /* Finish calculating the cost of the code. The results can be read back using the functions below. -- 2.43.0