> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index c566a85..1562098 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,15 @@
> +2013-11-13   Martin Liska    <marxin.li...@gmail.com>
> +                                             Jan Hubicka  <j...@suse.cz>
> +
> +     * cgraphunit.c (node_cmp): New function.
> +     (expand_all_functions): Function ordering added.
> +     * common.opt: New profile based function reordering flag introduced.
> +     * coverage.c (get_coverage_counts): Wrong profile handled.
> +     * ipa.c (cgraph_externally_visible_p): New late flag introduced.
> +     * lto-partition.c: Support for time profile added.
> +     * lto.c: Likewise.
> +     * value-prof.c: Histogram instrumentation switch added.
> +
>  2013-11-13  Vladimir Makarov  <vmaka...@redhat.com>
>  
>       PR rtl-optimization/59036
> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
> index 4765e6a..7cdd9a4 100644
> --- a/gcc/cgraphunit.c
> +++ b/gcc/cgraphunit.c
> @@ -1821,6 +1821,17 @@ expand_function (struct cgraph_node *node)
>    ipa_remove_all_references (&node->ref_list);
>  }
>  
> +/* Node comparer that is responsible for the order that corresponds
> +   to time when a function was launched for the first time.  */
> +
> +static int
> +node_cmp (const void *pa, const void *pb)
> +{
> +  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
> +  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
> +
> +  return b->tp_first_run - a->tp_first_run;
> +}

Please stabilize this by using node->order when tp_first_run is equivalent.
Later we ought to use a better heuristic here, but order may be good enough
to start with.
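
E.g. something like this (just a sketch; whether the node->order tie-break
should be ascending or descending is a guess):

static int
node_cmp (const void *pa, const void *pb)
{
  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;

  /* Primary key: time of first execution, as in the hunk above.  */
  if (a->tp_first_run != b->tp_first_run)
    return b->tp_first_run - a->tp_first_run;

  /* Tie-break on the symtab order, which is unique per node, so the
     result no longer depends on how qsort handles equal keys.  */
  return a->order - b->order;
}
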
> diff --git a/gcc/ipa.c b/gcc/ipa.c
> index a11b1c7..d92a332 100644
> --- a/gcc/ipa.c
> +++ b/gcc/ipa.c
> @@ -761,10 +761,14 @@ cgraph_externally_visible_p (struct cgraph_node *node,
>       This improves code quality and we know we will duplicate them at most twice
>       (in the case that we are not using plugin and link with object file
>        implementing same COMDAT)  */
> -  if ((in_lto_p || whole_program)
> -      && DECL_COMDAT (node->decl)
> -      && comdat_can_be_unshared_p (node))
> -    return false;
> +  if ((in_lto_p || whole_program || profile_arc_flag)
> +     && DECL_COMDAT (node->decl)
> +     && comdat_can_be_unshared_p (node))
> +    {
> +      gcc_checking_assert (cgraph_function_body_availability (node)
> +                        > AVAIL_OVERWRITABLE);
> +      return false;
> +    }
>  
>    /* When doing link time optimizations, hidden symbols become local.  */
>    if (in_lto_p
> @@ -932,7 +936,7 @@ function_and_variable_visibility (bool whole_program)
>       }
>        gcc_assert ((!DECL_WEAK (node->decl)
>                 && !DECL_COMDAT (node->decl))
> -                       || TREE_PUBLIC (node->decl)
> +               || TREE_PUBLIC (node->decl)
>                 || node->weakref
>                 || DECL_EXTERNAL (node->decl));
>        if (cgraph_externally_visible_p (node, whole_program))
> @@ -949,7 +953,7 @@ function_and_variable_visibility (bool whole_program)
>         && node->definition && !node->weakref
>         && !DECL_EXTERNAL (node->decl))
>       {
> -       gcc_assert (whole_program || in_lto_p
> +       gcc_assert (whole_program || in_lto_p || profile_arc_flag
>                     || !TREE_PUBLIC (node->decl));
>         node->unique_name = ((node->resolution == LDPR_PREVAILING_DEF_IRONLY
>                                     || node->resolution == LDPR_PREVAILING_DEF_IRONLY_EXP)

These changes are unrelated, please remove them.
> @@ -395,6 +397,20 @@ node_cmp (const void *pa, const void *pb)
>  {
>    const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
>    const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
> +
> +  /* Profile reorder flag enables function reordering based on first execution
> +     of a function. All functions with profile are placed in ascending
> +     order at the beginning.  */
> +
> +  if (flag_profile_reorder_functions)
                                       && a->tp_first_run != b->tp_first_run
> +  {
> +    if (a->tp_first_run && b->tp_first_run)
> +      return a->tp_first_run - b->tp_first_run;
> +
> +    if (a->tp_first_run || b->tp_first_run)
> +      return b->tp_first_run - a->tp_first_run;

Add a comment explaining the logic here ;)
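
I.e. something along these lines (a sketch only; the comment wording is just a
suggestion, and I am assuming the function keeps its existing order-based
fallback after the new block):

static int
node_cmp (const void *pa, const void *pb)
{
  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;

  /* Profile-based function reordering: functions executed during the
     training run come first, in ascending order of the time of their
     first execution; functions that never ran (tp_first_run == 0) are
     kept after all profiled ones.  */
  if (flag_profile_reorder_functions
      && a->tp_first_run != b->tp_first_run)
    {
      /* Both functions ran: earlier first execution sorts first.  */
      if (a->tp_first_run && b->tp_first_run)
        return a->tp_first_run - b->tp_first_run;

      /* Exactly one of them ran: the profiled one sorts first.  */
      return b->tp_first_run - a->tp_first_run;
    }

  /* Otherwise fall back to the pre-existing source-order comparison.  */
  return b->order - a->order;
}
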
> @@ -449,7 +465,7 @@ void
>  lto_balanced_map (void)
>  {
>    int n_nodes = 0;
> -  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
> +  int n_varpool_nodes = 0, varpool_pos = 0;
>    struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
>    struct varpool_node **varpool_order = NULL;
>    int i;
> @@ -481,10 +497,13 @@ lto_balanced_map (void)
>       get better about minimizing the function bounday, but until that
>       things works smoother if we order in source order.  */
>    qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
> +
> +  if (cgraph_dump_file)
> +    for(i = 0; i < n_nodes; i++)
> +      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
> +
>    if (!flag_toplevel_reorder)
>      {
> -      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
> -
>        FOR_EACH_VARIABLE (vnode)
>       if (get_symbol_class (vnode) == SYMBOL_PARTITION)
>         n_varpool_nodes++;

Good catch (the redundant qsort)

> @@ -683,7 +702,6 @@ lto_balanced_map (void)
>         best_i = i;
>         best_n_nodes = lto_symtab_encoder_size (partition->encoder);
>         best_total_size = total_size;
> -       best_varpool_pos = varpool_pos;
>       }
>        if (cgraph_dump_file)
>       fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
> @@ -701,7 +719,6 @@ lto_balanced_map (void)
>               fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
>                        i - best_i, best_i);
>             undo_partition (partition, best_n_nodes);
> -           varpool_pos = best_varpool_pos;
>           }
>         i = best_i;
>         /* When we are finished, avoid creating empty partition.  */

This actually reverts a bugfix, so please remove the two changes above.
> diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
> index 62856d0..2401c76 100644
> --- a/gcc/lto/lto.c
> +++ b/gcc/lto/lto.c
> @@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
>    /* Sort partitions by size so small ones are compiled last.
>       FIXME: Even when not reordering we may want to output one list for parallel make
>       and other for final link command.  */
> -  ltrans_partitions.qsort (flag_toplevel_reorder
> +
> +  if (!flag_profile_reorder_functions || !flag_profile_use)
> +    ltrans_partitions.qsort (flag_toplevel_reorder
>                          ? cmp_partitions_size
>                          : cmp_partitions_order);
> +

Add a FIXME explaining that we ought to produce two lists - one for make
parallelism and one for final link time.
Let's work on this incrementally.
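
For instance (the FIXME wording below is only a suggestion):

  /* FIXME: When reordering by profile we skip the sort entirely, but we
     really ought to produce two lists: one ordered for make parallelism
     (largest partitions first) and one for the final link command.  */
  if (!flag_profile_reorder_functions || !flag_profile_use)
    ltrans_partitions.qsort (flag_toplevel_reorder
                             ? cmp_partitions_size
                             : cmp_partitions_order);
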
> diff --git a/gcc/varasm.c b/gcc/varasm.c
> index 2226912..758367f 100644
> --- a/gcc/varasm.c
> +++ b/gcc/varasm.c
> @@ -548,7 +548,14 @@ default_function_section (tree decl, enum node_frequency freq,
>       unlikely executed (this happens especially with function splitting
>       where we can split away unnecessary parts of static constructors.  */
>    if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> -    return get_named_text_section (decl, ".text.startup", NULL);
> +    {
> +      /* If we have a profile and the LTO phase is executed, we do not need
> +         these ELF sections.  */
> +      if (!in_lto_p || !flag_profile_values)
> +        return get_named_text_section (decl, ".text.startup", NULL);
> +      else
> +        return NULL;
> +    }
>  
>    /* Similarly for exit.  */
>    if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> @@ -560,7 +567,10 @@ default_function_section (tree decl, enum node_frequency freq,
>        case NODE_FREQUENCY_UNLIKELY_EXECUTED:
>       return get_named_text_section (decl, ".text.unlikely", NULL);
>        case NODE_FREQUENCY_HOT:
> -     return get_named_text_section (decl, ".text.hot", NULL);
> +        /* If we have a profile and the LTO phase is executed, we do not need
> +           these ELF sections.  */
> +        if (!in_lto_p || !flag_profile_values)
> +          return get_named_text_section (decl, ".text.hot", NULL);
>        default:
>       return NULL;
>      }
Let's handle this in a separate patch.

The patch is missing documentation in doc/invoke.texi.  You need to document
the new command-line option and update the documentation of -fprofile-use.
Please add it and send an updated patch.

Also, Teresa recently sent a patch for resetting the profiles of functions where
we clearly missed the profile.
Please add code there that fills in tp_first_run from the largest tp_first_run
of the callers with a non-zero count.
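
Roughly like this (an untested sketch; where exactly it belongs in Teresa's
reset pass is open, it just shows the intended caller walk):

  /* NODE lost its profile; estimate tp_first_run as the largest
     first-run time among callers that were actually executed.  */
  if (!node->tp_first_run)
    {
      struct cgraph_edge *e;
      int best = 0;

      for (e = node->callers; e; e = e->next_caller)
        if (e->count > 0 && e->caller->tp_first_run > best)
          best = e->caller->tp_first_run;

      node->tp_first_run = best;
    }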

Thanks,
Honza
