Hi, I have attached a "quick and dirty" prototype patch (var-partition-1.diff), that attempts to partition variables to reduce number of external references and to increase usage of section-anchors to CSE address computation of global variables.
We could put a variable in a partition that has max references for it, however it doesn't lend itself directly to section anchor optimization. For instance if a partition has max references for variables 'a' and 'b', but no function in that partition references both 'a', and 'b' then AFAIU it doesn't make any difference from section anchors perspective to have them in same partition. The patch tries to assign a set of variables (>= 2) to a partition whose functions have maximum references for that set. Functions within the partition that reference the variables in the set can take advantage of section-anchors. Functions referencing the variables in the set outside the partition would need to load them as external references (using movw/movt), however since we are placing the set in partition that has maximal references for it, number of external references should be overall reduced. Partitioning is gated by -flto-var-partition and enabled only for arm and aarch64. As per previous discussion [1], I haven't touched function partitioning. Does this approach look ok especially regarding correctness ? So far, I have cross-tested patch on arm*-*-*, aarch64*-*-*. I haven't yet managed to benchmark the patch. As a cheap measurement, I tried to measure number of external references with and without patch by writing a small ipa pass which is run during ltrans and simply walks over varpool nodes and counts number of varpool_nodes for which DECL_EXTERNAL (vnode->decl) is true and vnode->definition is 0. Is that sufficient condition to determine if variable is externally defined ? I have attached the pass (count-external-refs.diff) and the comparison done with it for for SPEC2000 [2]. The entries in "before" and "after" column contain summation of number of external refs (total_count) across all partitions before and after applying the patch. Does the comparison hold any merit ? I was wondering if we could we use a better way for measuring statically the effects of variable partitioning ? I hope also to get done with benchmarking soon. I have not yet figured out how to integrate it with existing cost metrics for balanced partitioning, I am looking into that. I would be grateful for suggestions on the patch. [1] https://gcc.gnu.org/ml/gcc/2016-04/msg00090.html [2] SPEC2000 comparison: https://docs.google.com/spreadsheets/d/1xnszyw04ksoyBspmCVYesq6KARLw-PA2n3T4aoaKdYw/edit?usp=sharing
diff --git a/gcc/common.opt b/gcc/common.opt index f0d7196..e67e39e 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1699,6 +1699,10 @@ flto-partition= Common Joined RejectNegative Enum(lto_partition_model) Var(flag_lto_partition) Init(LTO_PARTITION_BALANCED) Specify the algorithm to partition symbols and vars at linktime. +flto-partition-var +Common Report Var(flag_lto_var_partition) Init(0) +Partition variables to take advantage of section anchors. + ; The initial value of -1 comes from Z_DEFAULT_COMPRESSION in zlib.h. flto-compression-level= Common Joined RejectNegative UInteger Var(flag_lto_compression_level) Init(-1) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index a0db3a4..dc33196 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -8252,6 +8252,7 @@ aarch64_override_options (void) aarch64_register_fma_steering (); + flag_lto_var_partition = 1; } /* Implement targetm.override_options_after_change. */ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 3503c15..92c12f5 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3458,6 +3458,8 @@ arm_option_override (void) /* Init initial mode for testing. */ thumb_flipper = TARGET_THUMB; + + flag_lto_var_partition = 1; } static void diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c index 453343a..3d1df48 100644 --- a/gcc/lto/lto-partition.c +++ b/gcc/lto/lto-partition.c @@ -34,6 +34,11 @@ along with GCC; see the file COPYING3. If not see #include "ipa-prop.h" #include "ipa-inline.h" #include "lto-partition.h" +#include <map> +#include <set> +#include <algorithm> +#include <vector> +#include <algorithm> vec<ltrans_partition> ltrans_partitions; @@ -407,6 +412,274 @@ add_sorted_nodes (vec<symtab_node *> &next_nodes, ltrans_partition partition) add_symbol_to_partition (partition, node); } +/* FIXME: Currently I don't care to compute power set if set has more than 4 vars, which + I chose as an arbitrary upper limit and should be analyzed with experimenting. + Or maybe fall back to computing power-set for n in general way, but that can be + very expensive if n is large. */ + +static void +compute_powerset (std::vector<varpool_node *>& set, std::vector< std::vector< varpool_node * > >& pset) +{ + if (set.size () > 4) + { + pset.push_back (set); + return; + } + + std::vector<varpool_node *> v; + +#define add_2_elems(x, y) \ + v.clear (); \ + v.push_back (set[x]); \ + v.push_back (set[y]); \ + +#define add_2_elems_to_pset(x, y) \ + add_2_elems(x, y) \ + pset.push_back (v); + +#define add_3_elems(x, y, z) \ + add_2_elems(x, y) \ + v.push_back (set[z]); \ + +#define add_3_elems_to_pset(x, y, z) \ + add_3_elems(x, y, z) \ + pset.push_back (v); + +#define add_4_elems(x, y, z, w) \ + add_3_elems(x, y, z) \ + v.push_back (set[z]); + +#define add_4_elems_to_pset(x, y, z, w) \ + add_4_elems(x, y, z, w) \ + pset.push_back (v); + + switch (set.size ()) + { + case 4: + add_2_elems_to_pset(0, 3) + add_2_elems_to_pset(1, 3) + add_2_elems_to_pset(2, 3) + add_3_elems_to_pset(0, 1, 3) + add_3_elems_to_pset(1, 2, 3) + add_4_elems_to_pset(0, 1, 2, 3) + /* fallthru. */ + + case 3: + add_2_elems_to_pset(0, 2) + add_2_elems_to_pset(1, 2) + add_3_elems_to_pset(0, 1, 2) + /* fallthru. */ + + case 2: + add_2_elems_to_pset(0, 1) + } + +} + +struct global_map_value { + unsigned partition_index; + unsigned count; +}; + +typedef std::pair< std::vector< varpool_node * >, global_map_value > global_map_pair; +typedef std::map< std::vector< varpool_node * >, global_map_value > global_map_t; + +static void +dump_global_pair (global_map_t::iterator it) +{ + const std::vector< varpool_node * >& v = it->first; + global_map_value gv = it->second; + + fprintf (stderr, "\n{"); + for (unsigned i = 0; i < v.size (); i++) + fprintf (stderr, "%s, ", v[i]->name ()); + fprintf (stderr, "} occuring %u times is assigned to %u partition\n", gv.count, gv.partition_index); +} + +static bool +global_pair_cmp (const global_map_t::iterator& it_1, const global_map_t::iterator& it_2) +{ + // compare for descending order + const std::vector< varpool_node *>& v1 = it_1->first; + global_map_value gv1 = it_1->second; + + const std::vector< varpool_node *>& v2 = it_2->first; + global_map_value gv2 = it_2->second; + + if (gv1.count > gv2.count) + return true; + + else if (gv1.count == gv2.count) + return v1.size () > v2.size (); + + else + return false; +} + +/* The function tries to assign a set of variables to partition that references it most. + The intent is that variables in this set would then be anchored and their address + computations can be CSE'd by using the anchor. + + The function performs 3 steps: + a) Assign set of variables used in a function to partition if that partition has maximum + references for the set. We do not consider set of variables used across functions, + for eg, if a is used in f1() and b in f2() then do not consider {a, b}. Only consider + sets of variables which are referenced in a single function. + So if f() references {a, b, c}, then consider + each combination involving at-least 2 vars: {a, b}, {a, c}, {b, c} and {a, b, c} and count + references for each of these sets in the partition. + global_map[set] == partition that has max references for the set. + If partition 0 hax max references (5) for {a, b} then global_map[{a, b}] = <0, 5> + where 0 is the partition_index and 5 is the number of references. global_map_value + is used to represent the tuple <partition_index, number_of_references>. + + b) Sort global_map in descending order according to count. + This is done so that if sets have overlapping variables then + we choose first to allocate the one which occurs more number of times. + For eg, if set {a, b} occurs 5 times in partition 0 and {a, b, c} occurs + 2 times in partition 1, then we first want to consider allocating + {a, b} to partition 0. + + c) Step 3: Try allocating variables to set in the partition. + This step might not add all variables from the set in the partition since + they could have been allocated in another partition. + + TODO: Integrate with cost-metrics for balanced partitioning. + + Possible improvements: + a) If a set has same number occurence in multiple partitions, choose a partition + whose functions have more repeated number of occurence of that set. + for eg: + partition 0: f1() uses {a, b} 3 times. + partition 1: f2(), f3(), f4() use {a, b} 1 time each. + In this case alocating {a, b} to partition 0 instead of partiton 1 would be more beneficial. + However this might not always be a win. + + b) Currently we choose a set with higher priority, whose occurence is strictly greater. + This might not always be optimal. + For instance if in a partition {a, b} occurs 'x' times and {a, b, c} occurs 'y' times, + and y < x but very close to x, it probably makes more sense to first consider the set {a, b, c}. + Perhaps define a range [x - c, x] where c is constant and choose allocation of set with higher + number of elements, whose number of occurences is y if y falls within [x -c, x]. + + c) hot/cold functions: We might want to ignore occurence of set in cold and noreturn functions and give + a higher priority to set that occurs in a hot function. +*/ + +static void +assign_vars_to_partitions (void) +{ +#define ASSIGN_VARS_TO_PARTITIONS_DEBUG + + global_map_t global_map; + + /* Step 1: Assign set to partition that has maxmium references for the set. */ + + for (unsigned i = 0; i < ltrans_partitions.length (); i++) + { + /* partition_map: map of set -> number of times it's referenced in the partition. */ + typedef std::map< std::vector< varpool_node * >, unsigned > partition_map_t; + partition_map_t partition_map; + + const ltrans_partition& part = ltrans_partitions[i]; + for (unsigned j = 0; j < lto_symtab_encoder_size (part->encoder); j++) + { + cgraph_node *cnode = dyn_cast<cgraph_node *> (lto_symtab_encoder_deref (part->encoder, j)); + if (!cnode) + continue; + + /* var_map: map of variable -> number of times it's referenced in the function. */ + std::map<varpool_node *, unsigned> var_map; + ipa_ref *ref; + + for (unsigned k = 0; cnode->iterate_reference (k, ref); k++) + if (varpool_node *vnode = dyn_cast<varpool_node *> (ref->referred)) + var_map[vnode]++; + + /* vars: set of variables referenced by function. */ + std::vector< varpool_node * > vars; + for (std::map<varpool_node *, unsigned>::const_iterator it = var_map.begin (); it != var_map.end (); ++it) + vars.push_back (it->first); + + /* If a function refers to only one variable, there's no point anchoring it. */ + if (vars.size () == 1) + continue; + + /* Compute the power set for vars and for each set within the power_set, count + occurences of the set in function. + The occurence of set == occurence of variable, which has least occurence + for instance occurence{a, b} == min (nrefs(a), nrefs(b)). + number of references for individual variables are already computed in var_map. */ + + std::vector< std::vector< varpool_node * > > vars_pset; + compute_powerset (vars, vars_pset); + + for (unsigned i = 0; i < vars_pset.size (); ++i) + { + unsigned min = UINT_MAX; + const std::vector< varpool_node * >& v = vars_pset[i]; + + for (unsigned i = 0; i < v.size (); i++) + if (var_map[v[i]] < min) + min = var_map[v[i]]; + partition_map[v] += min; + } + } + + /* Walk each set that we computed occurences for in this partition. If a set S has max occurences + in current partition, then update global_map[S] .count = partition_map[S], .partition_index = i; */ + + for (partition_map_t::iterator it = partition_map.begin (); it != partition_map.end (); ++it) + { + const std::vector<varpool_node *>& v = it->first; + unsigned count = it->second; + + global_map_value& gv = global_map[v]; + if (count > gv.count) + { + gv.partition_index = i; + gv.count = count; + } + } + } + + /* Step 2: We are not really sorting global_map, rather initially store all + global_map_t::iterators in keys vector, and then sort keys in + descending order. global_pair_cmp is used for comparison. */ + + std::vector< global_map_t::iterator > keys; + + for (global_map_t::iterator it = global_map.begin (); it != global_map.end (); ++it) + keys.push_back (it); + + std::sort (keys.begin (), keys.end (), global_pair_cmp); + + /* Step 3: Walk each iterator in keys, and attempt to put variables from set + in the respective partition. */ + + for (unsigned i = 0; i < keys.size (); i++) + { + global_map_t::iterator it = keys[i]; + const std::vector< varpool_node * >& v = it->first; + global_map_value gv = it->second; + + for (unsigned j = 0; j < v.size (); j++) + { + varpool_node *vnode = v[j]; + + if (!vnode->definition) + continue; + + if (!symbol_partitioned_p (vnode) && flag_toplevel_reorder + && !vnode->no_reorder + && vnode->get_partitioning_class () == SYMBOL_PARTITION) + { + // fprintf (stderr, "%s goes to partition %u\n", vnode->name (), gv.partition_index); + add_symbol_to_partition (ltrans_partitions[gv.partition_index], vnode); + } + } + } +} /* Group cgraph nodes into equally-sized partitions. @@ -628,6 +901,9 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size) for (j = 0; refs_node->iterate_reference (j, ref); j++) if (is_a <varpool_node *> (ref->referred)) { + if (flag_lto_var_partition) + continue; + int index; vnode = dyn_cast <varpool_node *> (ref->referred); @@ -663,6 +939,9 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size) for (j = 0; refs_node->iterate_referring (j, ref); j++) if (is_a <varpool_node *> (ref->referring)) { + if (flag_lto_var_partition) + continue; + int index; vnode = dyn_cast <varpool_node *> (ref->referring); @@ -763,6 +1042,8 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size) } next_nodes.truncate (0); + if (flag_lto_var_partition) + assign_vars_to_partitions (); /* Varables that are not reachable from the code go into last partition. */ if (flag_toplevel_reorder)
diff --git a/gcc/ipa.c b/gcc/ipa.c index 6722d3b..56e7ae9 100644 --- a/gcc/ipa.c +++ b/gcc/ipa.c @@ -1445,3 +1445,82 @@ make_pass_ipa_single_use (gcc::context *ctxt) { return new pass_ipa_single_use (ctxt); } + +static void +execute_ipa_external_ref (void) +{ + unsigned total_count = 0; + unsigned readonly_count = 0; + unsigned hot_count = 0; + unsigned cold_count = 0; + unsigned noreturn_count = 0; + + varpool_node *vnode; + FOR_EACH_VARIABLE (vnode) + { + if (vnode->definition) + continue; + if (!DECL_EXTERNAL (vnode->decl)) + continue; + + ipa_ref *ref; + for (unsigned i = 0; vnode->iterate_referring (i, ref); i++) + { + total_count++; +#if 0 + if (TREE_READONLY (vnode->decl)) + readonly_count++; + if (is_a<cgraph_node *> (ref->referring)) + { + if (lookup_attribute ("hot", DECL_ATTRIBUTES (ref->referring->decl))) + hot_count++; + if (lookup_attribute ("cold", DECL_ATTRIBUTES (ref->referring->decl))) + cold_count++; + if (lookup_attribute ("noreturn", DECL_ATTRIBUTES (ref->referring->decl))) + noreturn_count++; + } +#endif + } + + } + fprintf (stderr, "total_count = %u\n", total_count); +} + +namespace { + +const pass_data pass_data_ipa_external_ref = +{ + SIMPLE_IPA_PASS, /* type */ + "external-ref", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_ipa_external_ref : public simple_ipa_opt_pass +{ +public: + pass_ipa_external_ref (gcc::context *ctxt) + : simple_ipa_opt_pass (pass_data_ipa_external_ref, ctxt) + {} + + /* opt_pass methods: */ + virtual unsigned int execute (function *) + { + execute_ipa_external_ref (); + return 0; + } + +}; // class pass_data_ipa_external_ref + +} // anon namespace + +simple_ipa_opt_pass * +make_pass_ipa_external_ref (gcc::context *ctxt) +{ + return new pass_ipa_external_ref (ctxt); +} diff --git a/gcc/passes.def b/gcc/passes.def index 3647e90..46c0f3e 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -170,6 +170,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_ipa_pta); NEXT_PASS (pass_dispatcher_calls); NEXT_PASS (pass_omp_simd_clone); + NEXT_PASS (pass_ipa_external_ref); TERMINATE_PASS_LIST (all_late_ipa_passes) /* These passes are run after IPA passes on every function that is being diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 36299a6..e3309b3 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -500,6 +500,7 @@ extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_dispatcher_calls (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_omp_simd_clone (gcc::context *ctxt); +extern simple_ipa_opt_pass *make_pass_ipa_external_ref (gcc::context *ctxt); extern ipa_opt_pass_d *make_pass_ipa_profile (gcc::context *ctxt); extern ipa_opt_pass_d *make_pass_ipa_cdtor_merge (gcc::context *ctxt); extern ipa_opt_pass_d *make_pass_ipa_single_use (gcc::context *ctxt);