> diff --git a/gcc/ChangeLog b/gcc/ChangeLog > index c566a85..1562098 100644 > --- a/gcc/ChangeLog > +++ b/gcc/ChangeLog > @@ -1,3 +1,15 @@ > +2013-11-13 Martin Liska <marxin.li...@gmail.com> > + Jan Hubicka <j...@suse.cz> > + > + * cgraphunit.c (node_cmp): New function. > + (expand_all_functions): Function ordering added. > + * common.opt: New profile based function reordering flag introduced. > + * coverage.c (get_coverage_counts): Wrong profile handled. > + * ipa.c (cgraph_externally_visible_p): New late flag introduced. > + * lto-partition.c: Support for time profile added. > + * lto.c: Likewise. > + * value-prof.c: Histogram instrumentation switch added. > + > 2013-11-13 Vladimir Makarov <vmaka...@redhat.com> > > PR rtl-optimization/59036 > diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c > index 4765e6a..7cdd9a4 100644 > --- a/gcc/cgraphunit.c > +++ b/gcc/cgraphunit.c > @@ -1821,6 +1821,17 @@ expand_function (struct cgraph_node *node) > ipa_remove_all_references (&node->ref_list); > } > > +/* Node comparer that is responsible for the order that corresponds > + to time when a function was launched for the first time. */ > + > +static int > +node_cmp (const void *pa, const void *pb) > +{ > + const struct cgraph_node *a = *(const struct cgraph_node * const *) pa; > + const struct cgraph_node *b = *(const struct cgraph_node * const *) pb; > + > + return b->tp_first_run - a->tp_first_run;
Please stabilize this by using node->order when tp_first_run is equivalent. Later we ought to use better heuristic here, but order may be good enough to start with. > diff --git a/gcc/ipa.c b/gcc/ipa.c > index a11b1c7..d92a332 100644 > --- a/gcc/ipa.c > +++ b/gcc/ipa.c > @@ -761,10 +761,14 @@ cgraph_externally_visible_p (struct cgraph_node *node, > This improves code quality and we know we will duplicate them at most > twice > (in the case that we are not using plugin and link with object file > implementing same COMDAT) */ > - if ((in_lto_p || whole_program) > - && DECL_COMDAT (node->decl) > - && comdat_can_be_unshared_p (node)) > - return false; > + if ((in_lto_p || whole_program || profile_arc_flag) > + && DECL_COMDAT (node->decl) > + && comdat_can_be_unshared_p (node)) > + { > + gcc_checking_assert (cgraph_function_body_availability (node) > + > AVAIL_OVERWRITABLE); > + return false; > + } > > /* When doing link time optimizations, hidden symbols become local. */ > if (in_lto_p > @@ -932,7 +936,7 @@ function_and_variable_visibility (bool whole_program) > } > gcc_assert ((!DECL_WEAK (node->decl) > && !DECL_COMDAT (node->decl)) > - || TREE_PUBLIC (node->decl) > + || TREE_PUBLIC (node->decl) > || node->weakref > || DECL_EXTERNAL (node->decl)); > if (cgraph_externally_visible_p (node, whole_program)) > @@ -949,7 +953,7 @@ function_and_variable_visibility (bool whole_program) > && node->definition && !node->weakref > && !DECL_EXTERNAL (node->decl)) > { > - gcc_assert (whole_program || in_lto_p > + gcc_assert (whole_program || in_lto_p || profile_arc_flag > || !TREE_PUBLIC (node->decl)); > node->unique_name = ((node->resolution == LDPR_PREVAILING_DEF_IRONLY > || node->resolution == > LDPR_PREVAILING_DEF_IRONLY_EXP) These changes are unrelated, please remove them. 
> @@ -395,6 +397,20 @@ node_cmp (const void *pa, const void *pb) > { > const struct cgraph_node *a = *(const struct cgraph_node * const *) pa; > const struct cgraph_node *b = *(const struct cgraph_node * const *) pb; > + > + /* Profile reorder flag enables function reordering based on first > execution > + of a function. All functions with profile are placed in ascending > + order at the beginning. */ > + > + if (flag_profile_reorder_functions) && a->tp_first_run != b->tp_first_run > + { > + if (a->tp_first_run && b->tp_first_run) > + return a->tp_first_run - b->tp_first_run; > + > + if (a->tp_first_run || b->tp_first_run) > + return b->tp_first_run - a->tp_first_run; Please add a comment explaining the logic here ;) > @@ -449,7 +465,7 @@ void > lto_balanced_map (void) > { > int n_nodes = 0; > - int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0; > + int n_varpool_nodes = 0, varpool_pos = 0; > struct cgraph_node **order = XNEWVEC (struct cgraph_node *, > cgraph_max_uid); > struct varpool_node **varpool_order = NULL; > int i; > @@ -481,10 +497,13 @@ lto_balanced_map (void) > get better about minimizing the function bounday, but until that > things works smoother if we order in source order. 
*/ > qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp); > + > + if (cgraph_dump_file) > + for(i = 0; i < n_nodes; i++) > + fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", > cgraph_node_asm_name (order[i]), order[i]->tp_first_run); > + > if (!flag_toplevel_reorder) > { > - qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp); > - > FOR_EACH_VARIABLE (vnode) > if (get_symbol_class (vnode) == SYMBOL_PARTITION) > n_varpool_nodes++; Good catch (the redundant qsort) > @@ -683,7 +702,6 @@ lto_balanced_map (void) > best_i = i; > best_n_nodes = lto_symtab_encoder_size (partition->encoder); > best_total_size = total_size; > - best_varpool_pos = varpool_pos; > } > if (cgraph_dump_file) > fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i " > @@ -701,7 +719,6 @@ lto_balanced_map (void) > fprintf (cgraph_dump_file, "Unwinding %i insertions to step > %i\n", > i - best_i, best_i); > undo_partition (partition, best_n_nodes); > - varpool_pos = best_varpool_pos; > } > i = best_i; > /* When we are finished, avoid creating empty partition. */ This actually reverts a bugfix, so please remove the two changes above. > diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c > index 62856d0..2401c76 100644 > --- a/gcc/lto/lto.c > +++ b/gcc/lto/lto.c > @@ -2453,9 +2453,12 @@ lto_wpa_write_files (void) > /* Sort partitions by size so small ones are compiled last. > FIXME: Even when not reordering we may want to output one list for > parallel make > and other for final link command. */ > - ltrans_partitions.qsort (flag_toplevel_reorder > + > + if (!flag_profile_reorder_functions || !flag_profile_use) > + ltrans_partitions.qsort (flag_toplevel_reorder > ? cmp_partitions_size > : cmp_partitions_order); > + Add a FIXME explaining that we ought to produce two lists - one for Make parallelism and one for final link time. Let's work on this incrementally. 
> diff --git a/gcc/varasm.c b/gcc/varasm.c > index 2226912..758367f 100644 > --- a/gcc/varasm.c > +++ b/gcc/varasm.c > @@ -548,7 +548,14 @@ default_function_section (tree decl, enum node_frequency > freq, > unlikely executed (this happens especially with function splitting > where we can split away unnecessary parts of static constructors. */ > if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED) > - return get_named_text_section (decl, ".text.startup", NULL); > + { > + /* If we do have a profile or(and) LTO phase is executed, we do not > need > + these ELF section. */ > + if (!in_lto_p || !flag_profile_values) > + return get_named_text_section (decl, ".text.startup", NULL); > + else > + return NULL; > + } > > /* Similarly for exit. */ > if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED) > @@ -560,7 +567,10 @@ default_function_section (tree decl, enum node_frequency > freq, > case NODE_FREQUENCY_UNLIKELY_EXECUTED: > return get_named_text_section (decl, ".text.unlikely", NULL); > case NODE_FREQUENCY_HOT: > - return get_named_text_section (decl, ".text.hot", NULL); > + /* If we do have a profile or(and) LTO phase is executed, we do not > need > + these ELF section. */ > + if (!in_lto_p || !flag_profile_values) > + return get_named_text_section (decl, ".text.hot", NULL); > default: > return NULL; > } Let's handle this in a separate patch. The patch is missing documentation in doc/invoke.texi. You need to document the new command line option and update the documentation of -fprofile-use. Please add it and send an updated patch. Also Teresa recently sent a patch for resetting profiles of functions where we clearly missed the profile. Please add there code filling in tp_first_run from the largest first_run of the callers with non-0 count. Thanks, Honza