Hi Honza,
The attached patch implements the partitioning changes by grouping profiled
functions (having non-zero tp_first_run) together in partition_by_tp_first_run.
With balanced partitioning + partition_by_tp_first_run, for workload I tested
on, all the profiled symbols (with non-zero tp_first_run) land in one partition:
tp_first_run_order_pos: 474
But with only balanced partitioning, they get split across 4 different
partitions:
tp_first_run_order_pos: 55
tp_first_run_order_pos: 198
tp_first_run_order_pos: 58
tp_first_run_order_pos: 163
(and overall 10 less partitions are used with balanced partitioning +
partition_by_tp_first_run)
Altho, I didn't see much perf difference between balanced partitioning vs
balanced partitioning + partition_by_tp_first_run,
I am assuming it'd be possibly better wrt code locality if we explicitly
partition all profiled symbols together to ensure they land in single partition
(or as few
partitions as needed) ?
Signed-off-by: Prathamesh Kulkarni <[email protected]>
Thanks,
Prathamesh
Partition functions with non-zero tp_first_run together.
The patch places profiled functions in time profile order together in as
few paritions as possible to get better advantage of code locality.
Unlike PGO, where every instrumented function gets a time profile,
with AutoFDO, the sampled functions are a fraction of the total executed ones.
gcc/ChangeLog:
* ipa-locality-cloning.cc (partition_by_tp_first_run): New function.
(locality_partition_and_clone): Call partition_by_tp_first_run.
gcc/lto/ChangeLog:
* lto-partition.cc (create_partition): Move higher up in the file.
(partition_by_tp_first_run): New function.
(lto_balanced_map): Call partition_by_tp_first_run if -fauto-profile
-fprofile-reorder-functions is passed and noreroder is empty.
Signed-off-by: Prathamesh Kulkarni <[email protected]>
diff --git a/gcc/ipa-locality-cloning.cc b/gcc/ipa-locality-cloning.cc
index 2684046bd2d..20cac3191ea 100644
--- a/gcc/ipa-locality-cloning.cc
+++ b/gcc/ipa-locality-cloning.cc
@@ -994,6 +994,50 @@ locality_determine_static_order (auto_vec<locality_order
*> *order)
return true;
}
+/* Similar to lto-partition.cc:partition_by_timestamp.s */
+
+static locality_partition
+partition_by_tp_first_run (int64_t max_partition_size, int& npartitions)
+{
+ vec<cgraph_node *> cnodes = vNULL;
+ cgraph_node *node;
+ FOR_EACH_DEFINED_FUNCTION (node)
+ if (node->tp_first_run > 0
+ && node->get_partitioning_class () == SYMBOL_PARTITION
+ && ipa_size_summaries->get (node))
+ cnodes.safe_push (node);
+ cnodes.qsort (tp_first_run_node_cmp);
+
+ unsigned n_partitioned_symbols = 0;
+
+ locality_partition partition = NULL;
+ unsigned i;
+ FOR_EACH_VEC_ELT (cnodes, i, node)
+ {
+ if (node_partitioned_p (node))
+ continue;
+ ipa_size_summary *size_summary = ipa_size_summaries->get (node);
+
+ if (partition == NULL
+ || partition->insns + size_summary->size > max_partition_size)
+ partition = create_partition (npartitions);
+
+ add_node_to_partition (partition, node);
+ n_partitioned_symbols++;
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "partition_by_tp_first_run: Total partitions: %d\n",
npartitions);
+ fprintf (dump_file,
+ "partition_by_tp_first_run: Number of partitioned symbols: %d\n",
+ n_partitioned_symbols);
+ }
+
+ return partition;
+}
+
/* Partitioning for code locality.
1. Create and sort callchains. If PGO is available, use real profile
counts. Otherwise, use a set of heuristics to sort the callchains.
@@ -1007,7 +1051,7 @@ locality_partition_and_clone (int
max_locality_partition_size,
lto_locality_cloning_model cloning_model,
int freq_denominator, int size)
{
- locality_partition partition;
+ locality_partition partition = NULL;
int npartitions = 0;
auto_vec<locality_order *> order;
@@ -1040,7 +1084,11 @@ locality_partition_and_clone (int
max_locality_partition_size,
int64_t partition_size
= max_locality_partition_size
? max_locality_partition_size : param_max_partition_size;
- partition = create_partition (npartitions);
+
+ if (flag_auto_profile && flag_profile_reorder_functions)
+ partition = partition_by_tp_first_run (partition_size, npartitions);
+ if (!partition)
+ partition = create_partition (npartitions);
for (unsigned i = 0; i < order.length (); i++)
{
diff --git a/gcc/lto/lto-partition.cc b/gcc/lto/lto-partition.cc
index 435a0d98a7e..a0fdf943ceb 100644
--- a/gcc/lto/lto-partition.cc
+++ b/gcc/lto/lto-partition.cc
@@ -97,6 +97,15 @@ create_partition_if_empty ()
new_partition ("empty");
}
+/* Create and return the created partition of name NAME. */
+
+static ltrans_partition
+create_partition (int &npartitions, const char *name)
+{
+ npartitions++;
+ return new_partition (name);
+}
+
/* Free memory used by ltrans partition.
Encoder can be kept to be freed after streaming. */
static void
@@ -1059,6 +1068,50 @@ lto_cache_map (int n_lto_partitions, int
max_partition_size)
partitioner.apply (partitions, n_lto_partitions);
}
+/* Place symbols that are profiled in as few partitions as possible. */
+
+static ltrans_partition
+partition_by_tp_first_run (int max_partition_size, int &npartitions)
+{
+ vec<cgraph_node *> cnodes = vNULL;
+ cgraph_node *node;
+ FOR_EACH_DEFINED_FUNCTION (node)
+ if (node->tp_first_run > 0
+ && node->get_partitioning_class () == SYMBOL_PARTITION
+ && ipa_size_summaries->get (node))
+ cnodes.safe_push (node);
+ cnodes.qsort (tp_first_run_node_cmp);
+
+ unsigned n_partitioned_symbols = 0;
+
+ ltrans_partition partition = NULL;
+ unsigned i;
+ FOR_EACH_VEC_ELT (cnodes, i, node)
+ {
+ if (symbol_partitioned_p (node))
+ continue;
+
+ ipa_size_summary *size_summary = ipa_size_summaries->get (node);
+ if (partition == NULL
+ || partition->insns + size_summary->size > max_partition_size)
+ partition = create_partition (npartitions, "");
+
+ add_symbol_to_partition (partition, node);
+ n_partitioned_symbols++;
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "partition_by_tp_first_run: Total partitions: %d\n",
npartitions);
+ fprintf (dump_file,
+ "partition_by_tp_first_run: Number of partitioned symbols: %d\n",
+ n_partitioned_symbols);
+ }
+
+ return partition;
+}
+
/* Group cgraph nodes into equally-sized partitions.
The partitioning algorithm is simple: nodes are taken in predefined order.
@@ -1114,7 +1167,7 @@ lto_balanced_map (int n_lto_partitions, int
max_partition_size)
int64_t cost = 0, internal = 0;
unsigned int best_n_nodes = 0, best_i = 0;
int64_t best_cost = -1, best_internal = 0, best_size = 0;
- int npartitions;
+ int npartitions = 0;
int current_order = -1;
int noreorder_pos = 0;
@@ -1134,6 +1187,19 @@ lto_balanced_map (int n_lto_partitions, int
max_partition_size)
original_total_size = total_size;
+ /* With -fauto-profile -fprofile-reorder-functions, place all symbols which
+ are profiled together into as few partitions as possible. The rationale
+ for doing this with AutoFDO is that the number of sampled functions is a
+ fraction of total number of executed functions (unlike PGO where each
+ executed function gets instrumented with time profile counter), and
+ placing them together helps with code locality. */
+
+ partition = NULL;
+ if (flag_auto_profile
+ && flag_profile_reorder_functions
+ && noreorder.length () == 0)
+ partition = partition_by_tp_first_run (max_partition_size, npartitions);
+
/* Streaming works best when the source units do not cross partition
boundaries much. This is because importing function from a source
unit tends to import a lot of global trees defined there. We should
@@ -1168,8 +1234,12 @@ lto_balanced_map (int n_lto_partitions, int
max_partition_size)
partition_size = total_size / n_lto_partitions;
if (partition_size < param_min_partition_size)
partition_size = param_min_partition_size;
- npartitions = 1;
- partition = new_partition ("");
+
+ if (npartitions == 0)
+ {
+ npartitions = 1;
+ partition = new_partition ("");
+ }
if (dump_file)
fprintf (dump_file, "Total unit size: %" PRId64 ", partition size: %"
PRId64 "\n",
total_size, partition_size);
@@ -1512,15 +1582,6 @@ add_node_references_to_partition (ltrans_partition
partition, symtab_node *node)
}
-/* Create and return the created partition of name NAME. */
-
-static ltrans_partition
-create_partition (int &npartitions, const char *name)
-{
- npartitions++;
- return new_partition (name);
-}
-
/* Partitioning for code locality.
The partitioning plan (and prerequisite cloning) will have been done by the
IPA locality cloning pass. This function just implements that plan by