Hi,
I have attached a "quick and dirty" prototype patch (var-partition-1.diff),
that attempts to partition variables to reduce number of
external references and to increase usage of section-anchors
to CSE address computation of global variables.

We could put a variable in a partition that has max references for it,
however it doesn't lend itself directly to section anchor optimization.
For instance if a partition has max references for variables 'a' and 'b',
but no function in that partition references both 'a', and 'b' then AFAIU
it doesn't make any difference from section anchors perspective to have them
in same partition.

The patch tries to assign a set of variables (>= 2)
to a partition whose functions have maximum references for that set.
Functions within the partition that reference the variables
in the set can take advantage of section-anchors. Functions
referencing the variables in the set outside the partition
would need to load them as external references (using movw/movt),
however since we are placing the set in partition that has maximal
references for it, number of external references should be overall
reduced.

Partitioning is gated by -flto-var-partition and enabled
only for arm and aarch64. As per previous discussion [1], I haven't
touched function partitioning. Does this approach look ok
especially regarding correctness ?
So far, I have cross-tested patch on arm*-*-*, aarch64*-*-*.

I haven't yet managed to benchmark the patch.
As a cheap measurement, I tried to measure number of external
references with and without patch by writing a small ipa pass
which is run during ltrans and simply walks over varpool nodes
and counts number of varpool_nodes for which DECL_EXTERNAL (vnode->decl) is true
and vnode->definition is 0. Is that sufficient condition to determine
if variable is externally defined ? I have attached the pass
(count-external-refs.diff)
and the comparison done with it for for SPEC2000 [2]. The entries
in "before" and "after" column contain summation of number of
external refs (total_count) across all partitions before and after applying
the patch. Does the comparison hold any merit ?
I was wondering if we could we use a better way for
measuring statically the effects of variable partitioning ?
I hope also to get done with benchmarking soon.

I have not yet figured out how to integrate it with existing cost metrics for
balanced partitioning, I am looking into that.
I would be grateful for suggestions on the patch.

[1] https://gcc.gnu.org/ml/gcc/2016-04/msg00090.html

[2] SPEC2000 comparison:
https://docs.google.com/spreadsheets/d/1xnszyw04ksoyBspmCVYesq6KARLw-PA2n3T4aoaKdYw/edit?usp=sharing
diff --git a/gcc/common.opt b/gcc/common.opt
index f0d7196..e67e39e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1699,6 +1699,10 @@ flto-partition=
 Common Joined RejectNegative Enum(lto_partition_model) Var(flag_lto_partition) 
Init(LTO_PARTITION_BALANCED)
 Specify the algorithm to partition symbols and vars at linktime.
 
+flto-partition-var
+Common Report Var(flag_lto_var_partition) Init(0)
+Partition variables to take advantage of section anchors.
+
 ; The initial value of -1 comes from Z_DEFAULT_COMPRESSION in zlib.h.
 flto-compression-level=
 Common Joined RejectNegative UInteger Var(flag_lto_compression_level) Init(-1)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a0db3a4..dc33196 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8252,6 +8252,7 @@ aarch64_override_options (void)
 
   aarch64_register_fma_steering ();
 
+  flag_lto_var_partition = 1;
 }
 
 /* Implement targetm.override_options_after_change.  */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3503c15..92c12f5 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -3458,6 +3458,8 @@ arm_option_override (void)
 
   /* Init initial mode for testing.  */
   thumb_flipper = TARGET_THUMB;
+
+  flag_lto_var_partition = 1;
 }
 
 static void
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 453343a..3d1df48 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -34,6 +34,11 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-prop.h"
 #include "ipa-inline.h"
 #include "lto-partition.h"
+#include <map>
+#include <set>
+#include <algorithm>
+#include <vector>
+#include <algorithm>
 
 vec<ltrans_partition> ltrans_partitions;
 
@@ -407,6 +412,274 @@ add_sorted_nodes (vec<symtab_node *> &next_nodes, 
ltrans_partition partition)
       add_symbol_to_partition (partition, node);
 }
 
+/* FIXME: Currently I don't care to compute power set if set has more than 4 
vars, which
+   I chose as an arbitrary upper limit and should be analyzed with 
experimenting.
+   Or maybe fall back to computing power-set for n in general way, but that 
can be
+   very expensive if n is large.  */
+
+static void
+compute_powerset (std::vector<varpool_node *>& set, std::vector< std::vector< 
varpool_node * > >& pset)
+{
+  if (set.size () > 4)
+    {
+      pset.push_back (set);
+      return;
+    }
+  
+  std::vector<varpool_node *> v;
+
+#define add_2_elems(x, y)  \
+  v.clear ();  \
+  v.push_back (set[x]);    \
+  v.push_back (set[y]);    \
+
+#define add_2_elems_to_pset(x, y) \
+  add_2_elems(x, y) \
+  pset.push_back (v);
+
+#define add_3_elems(x, y, z)  \
+  add_2_elems(x, y)  \
+  v.push_back (set[z]); \
+
+#define add_3_elems_to_pset(x, y, z) \
+  add_3_elems(x, y, z) \
+  pset.push_back (v);
+
+#define add_4_elems(x, y, z, w)  \
+  add_3_elems(x, y, z) \
+  v.push_back (set[z]); 
+
+#define add_4_elems_to_pset(x, y, z, w) \
+  add_4_elems(x, y, z, w) \
+  pset.push_back (v);
+
+  switch (set.size ()) 
+    {
+      case 4:
+       add_2_elems_to_pset(0, 3)
+       add_2_elems_to_pset(1, 3)
+       add_2_elems_to_pset(2, 3)
+       add_3_elems_to_pset(0, 1, 3)
+       add_3_elems_to_pset(1, 2, 3)
+       add_4_elems_to_pset(0, 1, 2, 3)
+       /* fallthru.  */
+      
+      case 3:
+       add_2_elems_to_pset(0, 2)
+       add_2_elems_to_pset(1, 2)
+       add_3_elems_to_pset(0, 1, 2)
+       /* fallthru.  */
+
+      case 2:
+        add_2_elems_to_pset(0, 1)      
+    }
+
+}
+
+struct global_map_value {
+  unsigned partition_index;
+  unsigned count;
+};
+
+typedef std::pair< std::vector< varpool_node * >, global_map_value > 
global_map_pair;
+typedef std::map< std::vector< varpool_node * >, global_map_value > 
global_map_t;
+
+static void
+dump_global_pair (global_map_t::iterator it)
+{
+  const std::vector< varpool_node * >& v = it->first;
+  global_map_value gv = it->second;
+
+  fprintf (stderr, "\n{");
+  for (unsigned i = 0; i < v.size (); i++)
+    fprintf (stderr, "%s, ", v[i]->name ());
+  fprintf (stderr, "} occuring %u times is assigned to %u partition\n", 
gv.count, gv.partition_index);
+}
+
+static bool
+global_pair_cmp (const global_map_t::iterator& it_1, const 
global_map_t::iterator& it_2)
+{
+  // compare for descending order
+  const std::vector< varpool_node *>& v1 = it_1->first;
+  global_map_value gv1 = it_1->second;
+
+  const std::vector< varpool_node *>& v2 = it_2->first;
+  global_map_value gv2 = it_2->second;
+
+  if (gv1.count > gv2.count)
+    return true;
+
+  else if (gv1.count == gv2.count)
+    return v1.size () > v2.size (); 
+
+  else
+    return false;
+}
+
+/*  The function tries to assign a set of variables to partition that 
references it most.
+    The intent is that variables in this set would then be anchored and their 
address
+    computations can be CSE'd by using the anchor.
+
+    The function performs 3 steps:
+    a) Assign set of variables used in a function to partition if that 
partition has maximum
+       references for the set. We do not consider set of variables used across 
functions,
+       for eg, if a is used in f1() and b in f2() then do not consider {a, b}. 
Only consider
+       sets of variables which are referenced in a single function.
+       So if f() references {a, b, c}, then consider
+       each combination involving at-least 2 vars: {a, b}, {a, c}, {b, c} and 
{a, b, c} and count
+       references for each of these sets in the partition.
+       global_map[set] == partition that has max references for the set.
+       If partition 0 hax max references (5) for {a, b} then global_map[{a, 
b}] = <0, 5>
+       where 0 is the partition_index and 5 is the number of references. 
global_map_value
+       is used to represent the tuple <partition_index, number_of_references>.
+
+    b) Sort global_map in descending order according to  count.
+       This is done so that if sets have overlapping variables then
+       we choose first to allocate the one which occurs more number of times.
+       For eg, if set {a, b} occurs 5 times in partition 0 and {a, b, c} occurs
+       2 times in partition 1, then we first want to consider allocating
+       {a, b} to partition 0.
+
+    c) Step 3: Try allocating variables to set in the partition.
+       This step might not add all variables from the set in the partition 
since
+       they could have been allocated in another partition. 
+
+   TODO: Integrate with cost-metrics for balanced partitioning.
+
+   Possible improvements:
+   a) If a set has same number occurence in multiple partitions, choose a 
partition
+      whose functions have more repeated number of occurence of that set. 
+      for eg:
+      partition 0: f1() uses {a, b} 3 times.
+      partition 1: f2(), f3(), f4() use {a, b} 1 time each.
+      In this case alocating {a, b} to partition 0 instead of partiton 1 would 
be more beneficial.
+      However this might not always be a win.
+
+   b) Currently we choose a set with higher priority, whose occurence is 
strictly greater.
+      This might not always be optimal.
+      For instance if in a  partition {a, b} occurs 'x' times and {a, b, c} 
occurs 'y' times,
+      and y < x but very close to x, it probably makes more sense to first 
consider the set {a, b, c}.
+      Perhaps define a range [x - c, x] where c is constant and choose 
allocation of set with higher
+      number of elements, whose number of occurences is y if y falls within [x 
-c, x].
+
+   c) hot/cold functions: We might want to ignore occurence of set in cold and 
noreturn functions and give
+      a higher priority to set that occurs in a hot function.
+*/
+
+static void
+assign_vars_to_partitions (void)
+{
+#define ASSIGN_VARS_TO_PARTITIONS_DEBUG
+
+  global_map_t global_map;
+
+  /* Step 1: Assign set to partition that has maxmium references for the set.  
*/
+
+  for (unsigned i = 0; i < ltrans_partitions.length (); i++)
+    {
+      /* partition_map: map of set -> number of times it's referenced in the 
partition.  */ 
+      typedef std::map< std::vector< varpool_node * >, unsigned > 
partition_map_t;
+      partition_map_t partition_map;
+
+      const ltrans_partition& part = ltrans_partitions[i];
+      for (unsigned j = 0; j < lto_symtab_encoder_size (part->encoder); j++)
+       {
+         cgraph_node *cnode = dyn_cast<cgraph_node *> 
(lto_symtab_encoder_deref (part->encoder, j));
+         if (!cnode)
+           continue;
+
+         /* var_map: map of variable -> number of times it's referenced in the 
function.  */
+         std::map<varpool_node *, unsigned> var_map;
+         ipa_ref *ref;   
+
+         for (unsigned k = 0; cnode->iterate_reference (k, ref); k++)
+           if (varpool_node *vnode = dyn_cast<varpool_node *> (ref->referred))
+             var_map[vnode]++;
+
+         /* vars: set of variables referenced by function.  */
+         std::vector< varpool_node * > vars; 
+         for (std::map<varpool_node *, unsigned>::const_iterator it = 
var_map.begin (); it != var_map.end (); ++it)
+           vars.push_back (it->first);
+
+         /* If a function refers to only one variable, there's no point 
anchoring it.  */ 
+         if (vars.size () == 1)
+           continue;
+
+         /* Compute the power set for vars and for each set within the 
power_set, count
+            occurences of the set in function.
+             The occurence of set == occurence of variable, which has least 
occurence
+             for instance occurence{a, b} == min (nrefs(a), nrefs(b)).
+            number of references for individual variables are already computed 
in var_map.  */
+
+         std::vector< std::vector< varpool_node * > > vars_pset; 
+         compute_powerset (vars, vars_pset);
+
+         for (unsigned i = 0; i < vars_pset.size (); ++i)
+           {
+             unsigned min = UINT_MAX;
+             const std::vector< varpool_node * >& v = vars_pset[i];
+
+             for (unsigned i = 0; i < v.size (); i++)
+               if (var_map[v[i]] < min)
+                 min = var_map[v[i]];
+             partition_map[v] += min;
+           }
+       }
+
+      /* Walk each set that we computed occurences for in this partition. If a 
set S has max occurences
+        in current partition, then update global_map[S] .count = 
partition_map[S], .partition_index = i;  */
+    
+      for (partition_map_t::iterator it = partition_map.begin (); it != 
partition_map.end (); ++it)
+       {
+         const std::vector<varpool_node *>& v = it->first;
+         unsigned count = it->second;
+
+         global_map_value& gv = global_map[v];
+         if (count > gv.count)
+           {
+             gv.partition_index = i; 
+             gv.count = count;
+           }
+       }
+    }
+
+  /* Step 2: We are not really sorting global_map, rather initially store all
+     global_map_t::iterators in keys vector, and then sort keys in
+     descending order. global_pair_cmp is used for comparison.  */ 
+
+  std::vector< global_map_t::iterator > keys;
+
+  for (global_map_t::iterator it = global_map.begin (); it != global_map.end 
(); ++it)
+    keys.push_back (it);
+  
+  std::sort (keys.begin (), keys.end (), global_pair_cmp);
+  
+  /* Step 3: Walk each iterator in keys, and attempt to put variables from set
+     in the respective partition.  */
+
+  for (unsigned i = 0; i < keys.size (); i++)
+    {
+      global_map_t::iterator it = keys[i];
+      const std::vector< varpool_node * >& v = it->first;
+      global_map_value gv = it->second;
+
+      for (unsigned j = 0; j < v.size (); j++)
+       {
+         varpool_node *vnode = v[j];
+
+         if (!vnode->definition)
+           continue;
+
+         if (!symbol_partitioned_p (vnode) && flag_toplevel_reorder
+             && !vnode->no_reorder
+             && vnode->get_partitioning_class () == SYMBOL_PARTITION)
+           {
+         //    fprintf (stderr, "%s goes to partition %u\n", vnode->name (), 
gv.partition_index);
+             add_symbol_to_partition (ltrans_partitions[gv.partition_index], 
vnode);
+           }
+       }
+    }
+}
 
 /* Group cgraph nodes into equally-sized partitions.
 
@@ -628,6 +901,9 @@ lto_balanced_map (int n_lto_partitions, int 
max_partition_size)
          for (j = 0; refs_node->iterate_reference (j, ref); j++)
            if (is_a <varpool_node *> (ref->referred))
              {
+               if (flag_lto_var_partition)
+                 continue;
+
                int index;
 
                vnode = dyn_cast <varpool_node *> (ref->referred);
@@ -663,6 +939,9 @@ lto_balanced_map (int n_lto_partitions, int 
max_partition_size)
          for (j = 0; refs_node->iterate_referring (j, ref); j++)
            if (is_a <varpool_node *> (ref->referring))
              {
+               if (flag_lto_var_partition)
+                 continue;
+
                int index;
 
                vnode = dyn_cast <varpool_node *> (ref->referring);
@@ -763,6 +1042,8 @@ lto_balanced_map (int n_lto_partitions, int 
max_partition_size)
     }
 
   next_nodes.truncate (0);
+  if (flag_lto_var_partition)
+    assign_vars_to_partitions ();
 
   /* Varables that are not reachable from the code go into last partition.  */
   if (flag_toplevel_reorder)
diff --git a/gcc/ipa.c b/gcc/ipa.c
index 6722d3b..56e7ae9 100644
--- a/gcc/ipa.c
+++ b/gcc/ipa.c
@@ -1445,3 +1445,82 @@ make_pass_ipa_single_use (gcc::context *ctxt)
 {
   return new pass_ipa_single_use (ctxt);
 }
+
+static void
+execute_ipa_external_ref (void)
+{
+  unsigned total_count = 0;
+  unsigned readonly_count = 0;
+  unsigned hot_count = 0;
+  unsigned cold_count = 0;
+  unsigned noreturn_count = 0;
+
+  varpool_node *vnode;
+  FOR_EACH_VARIABLE (vnode)
+    {
+      if (vnode->definition)
+       continue;
+      if (!DECL_EXTERNAL (vnode->decl))
+       continue;
+
+      ipa_ref *ref;
+      for (unsigned i = 0; vnode->iterate_referring (i, ref); i++)
+       {
+         total_count++;
+#if 0
+         if (TREE_READONLY (vnode->decl))
+           readonly_count++;
+         if (is_a<cgraph_node *> (ref->referring))
+           {
+             if (lookup_attribute ("hot", DECL_ATTRIBUTES 
(ref->referring->decl)))
+               hot_count++;
+             if (lookup_attribute ("cold", DECL_ATTRIBUTES 
(ref->referring->decl)))
+               cold_count++;
+             if (lookup_attribute ("noreturn", DECL_ATTRIBUTES 
(ref->referring->decl)))
+               noreturn_count++;
+          }
+#endif
+       }
+      
+    }
+  fprintf (stderr, "total_count = %u\n", total_count);
+}
+
+namespace {
+
+const pass_data pass_data_ipa_external_ref =
+{
+  SIMPLE_IPA_PASS, /* type */
+  "external-ref", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_ipa_external_ref : public simple_ipa_opt_pass
+{
+public:
+  pass_ipa_external_ref (gcc::context *ctxt)
+    : simple_ipa_opt_pass (pass_data_ipa_external_ref, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual unsigned int execute (function *)
+    {
+      execute_ipa_external_ref ();
+      return 0;
+    }
+
+}; // class pass_data_ipa_external_ref
+
+} // anon namespace
+
+simple_ipa_opt_pass *
+make_pass_ipa_external_ref (gcc::context *ctxt)
+{
+  return new pass_ipa_external_ref (ctxt);
+}
diff --git a/gcc/passes.def b/gcc/passes.def
index 3647e90..46c0f3e 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -170,6 +170,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_ipa_pta);
   NEXT_PASS (pass_dispatcher_calls);
   NEXT_PASS (pass_omp_simd_clone);
+  NEXT_PASS (pass_ipa_external_ref);
   TERMINATE_PASS_LIST (all_late_ipa_passes)
 
   /* These passes are run after IPA passes on every function that is being
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 36299a6..e3309b3 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -500,6 +500,7 @@ extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context 
*ctxt);
 extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt);
 extern simple_ipa_opt_pass *make_pass_dispatcher_calls (gcc::context *ctxt);
 extern simple_ipa_opt_pass *make_pass_omp_simd_clone (gcc::context *ctxt);
+extern simple_ipa_opt_pass *make_pass_ipa_external_ref (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_profile (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_cdtor_merge (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_single_use (gcc::context *ctxt);

Reply via email to