https://gcc.gnu.org/g:5c0758c174c596215857427092e33353f4c1fa72

commit r16-2037-g5c0758c174c596215857427092e33353f4c1fa72
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Sun Jul 6 14:42:54 2025 +0200

    Add cutoff information to profile_info and use it when forcing non-zero value
    
    The main difference between normal profile feedback and auto-fdo is that
    with profile feedback every basic block with a non-zero profile has an
    incoming edge with a non-zero profile.  With auto-profile it is possible
    that none of the predecessors was sampled, and the tool also has a cutoff
    parameter which makes it ignore small counts.
    
    This becomes a problem when one tries to specialize code and scale the
    profile.  For example, if an inline function happens to have a hot loop
    with non-zero counts but its entry count is zero, and we want to inline it
    into a call with a non-zero count X, we want to scale the body by X/0,
    which we currently turn into X/1.
    
    This is a problem since I added logic to scale up the auto-profiles (to get
    some extra bits of precision), so X is often a large value and multiplying
    by X is not the right answer at all.  The multiply factor should be <= 1.
    
    Iterating this a few times will make the counts cap and we will lose any
    useful info.  The original implementation avoided this by doing all inlines
    before AFDO readback, but this is not possible with LTO (unless we move
    AFDO readback to WPA or add support for context sensitive profiles).  I
    think I can get the scaling to work reasonably well and then we can look
    into possible benefits of context sensitive profiling, which can be
    implemented both atop of AFDO as well as FDO.
    
    This patch adds a cutoff value to profile_info, which is initialized by
    profile feedback to 1 and by auto-profile to the scale factor (since we do
    not know the cutoff create_gcov used; llvm's tool streams it and we
    probably should too).  Then force_nonzero forces every value smaller than
    cutoff/2 + 1 to cutoff/2 + 1, which should keep scaling factors in
    reasonable ranges.
    
    gcc/ChangeLog:
    
            * auto-profile.cc
            (autofdo_source_profile::read): Scale cutoff.
            (read_autofdo_file): Initialize cutoff.
            * coverage.cc (read_counts_file): Initialize cutoff to 1.
            * gcov-io.h (struct gcov_summary): Add cutoff field.
            * ipa-inline.cc (inline_small_functions): max_count can be non-zero
            also with auto_profile.
            * lto-cgraph.cc (output_profile_summary): Write cutoff
            and sum_max.
            (input_profile_summary): Read cutoff and sum_max.
            (merge_profile_summaries): Initialize and scale global cutoffs
            and sum_max.
            * profile-count.cc: Include profile.h.
            (profile_count::force_nonzero): Move here from ...; use cutoff.
            * profile-count.h (profile_count::force_nonzero): ... here.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/tree-prof/clone-merge-1.c: Update expected dump message.

Diff:
---
 gcc/auto-profile.cc                            |  4 ++++
 gcc/coverage.cc                                |  1 +
 gcc/gcov-io.h                                  |  5 +++++
 gcc/ipa-inline.cc                              |  1 +
 gcc/lto-cgraph.cc                              | 19 ++++++++++++++++---
 gcc/profile-count.cc                           | 25 +++++++++++++++++++++++++
 gcc/profile-count.h                            | 13 +------------
 gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c |  2 +-
 8 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 64f4cda1b52d..a970eb8972fa 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -2522,6 +2522,7 @@ autofdo_source_profile::read ()
     afdo_count_scale
       = MAX (((gcov_type)1 << (profile_count::n_bits / 2))
             / afdo_profile_info->sum_max, 1);
+  afdo_profile_info->cutoff *= afdo_count_scale;
   afdo_hot_bb_threshod
     = hot_frac
       ? afdo_profile_info->sum_max * afdo_count_scale / hot_frac
@@ -2531,10 +2532,12 @@ autofdo_source_profile::read ()
     fprintf (dump_file, "Max count in profile %" PRIu64 "\n"
                        "Setting scale %" PRIu64 "\n"
                        "Scaled max count %" PRIu64 "\n"
+                       "Cutoff %" PRIu64 "\n"
                        "Hot count threshold %" PRIu64 "\n\n",
             (int64_t)afdo_profile_info->sum_max,
             (int64_t)afdo_count_scale,
             (int64_t)(afdo_profile_info->sum_max * afdo_count_scale),
+            (int64_t)afdo_profile_info->cutoff,
             (int64_t)afdo_hot_bb_threshod);
   afdo_profile_info->sum_max *= afdo_count_scale;
   return true;
@@ -3865,6 +3868,7 @@ read_autofdo_file (void)
   autofdo::afdo_profile_info = XNEW (gcov_summary);
   autofdo::afdo_profile_info->runs = 1;
   autofdo::afdo_profile_info->sum_max = 0;
+  autofdo::afdo_profile_info->cutoff = 1;
 
   /* Read the profile from the profile file.  */
   autofdo::read_profile ();
diff --git a/gcc/coverage.cc b/gcc/coverage.cc
index dd3ed2ed8429..75a24c614486 100644
--- a/gcc/coverage.cc
+++ b/gcc/coverage.cc
@@ -238,6 +238,7 @@ read_counts_file (void)
          gcov_profile_info = profile_info = XCNEW (gcov_summary);
          profile_info->runs = gcov_read_unsigned ();
          profile_info->sum_max = gcov_read_unsigned ();
+         profile_info->cutoff = 1;
        }
       else if (GCOV_TAG_IS_COUNTER (tag) && fn_ident)
        {
diff --git a/gcc/gcov-io.h b/gcc/gcov-io.h
index d48291c1fe35..f3e3a1c08da8 100644
--- a/gcc/gcov-io.h
+++ b/gcc/gcov-io.h
@@ -349,6 +349,11 @@ struct gcov_summary
 {
   gcov_unsigned_t runs;                /* Number of program runs.  */
   gcov_type sum_max;           /* Sum of individual run max values.  */
+  gcov_type cutoff;            /* Values smaller than this value are not
+                                  reliable (0 may mean non-zero).
+                                  For read profile cutoff is typically 1
+                                  however when we scale up or use auto-fdo
+                                  it may become bigger value.  */
 };
 
 #if !defined(inhibit_libc)
diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index ca605b027dcf..0cf97a80687e 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -2222,6 +2222,7 @@ inline_small_functions (void)
 
   gcc_assert (in_lto_p
              || !(max_count > 0)
+             || flag_auto_profile
              || (profile_info && flag_branch_probabilities));
 
   while (!edge_heap.empty ())
diff --git a/gcc/lto-cgraph.cc b/gcc/lto-cgraph.cc
index ec34f659d6a4..0af2e889af85 100644
--- a/gcc/lto-cgraph.cc
+++ b/gcc/lto-cgraph.cc
@@ -718,11 +718,12 @@ output_profile_summary (struct lto_simple_output_block 
*ob)
 {
   if (profile_info)
     {
-      /* We do not output num and run_max, they are not used by
-         GCC profile feedback and they are difficult to merge from multiple
-         units.  */
       unsigned runs = (profile_info->runs);
       streamer_write_uhwi_stream (ob->main_stream, runs);
+      streamer_write_gcov_count_stream (ob->main_stream,
+                                       profile_info->sum_max);
+      streamer_write_gcov_count_stream (ob->main_stream,
+                                       profile_info->cutoff);
 
       /* IPA-profile computes hot bb threshold based on cumulated
         whole program profile.  We need to stream it down to ltrans.  */
@@ -1678,6 +1679,8 @@ input_profile_summary (class lto_input_block *ib,
   if (runs)
     {
       file_data->profile_info.runs = runs;
+      file_data->profile_info.sum_max = streamer_read_gcov_count (ib);
+      file_data->profile_info.cutoff = streamer_read_gcov_count (ib);
 
       /* IPA-profile computes hot bb threshold based on cumulated
         whole program profile.  We need to stream it down to ltrans.  */
@@ -1719,6 +1722,8 @@ merge_profile_summaries (struct lto_file_decl_data 
**file_data_vec)
 
   profile_info = XCNEW (gcov_summary);
   profile_info->runs = max_runs;
+  profile_info->sum_max = 0;
+  profile_info->cutoff = 0;
 
   /* If merging already happent at WPA time, we are done.  */
   if (flag_ltrans)
@@ -1735,6 +1740,14 @@ merge_profile_summaries (struct lto_file_decl_data 
**file_data_vec)
 
        scale = RDIV (node->count_materialization_scale * max_runs,
                       node->lto_file_data->profile_info.runs);
+       gcov_type sum_max = RDIV (node->lto_file_data->profile_info.sum_max * 
max_runs,
+                                 node->lto_file_data->profile_info.runs);
+       gcov_type cutoff = RDIV (node->lto_file_data->profile_info.cutoff * 
max_runs,
+                                node->lto_file_data->profile_info.runs);
+       if (sum_max > profile_info->sum_max)
+         profile_info->sum_max = sum_max;
+       if (cutoff > profile_info->cutoff)
+         profile_info->cutoff = cutoff;
        node->count_materialization_scale = scale;
        if (scale < 0)
          fatal_error (input_location, "Profile information in %s corrupted",
diff --git a/gcc/profile-count.cc b/gcc/profile-count.cc
index 21477008b702..8f05a79a4372 100644
--- a/gcc/profile-count.cc
+++ b/gcc/profile-count.cc
@@ -32,6 +32,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cgraph.h"
 #include "wide-int.h"
 #include "sreal.h"
+#include "profile.h"
 
 /* Names from profile_quality enum values.  */
 
@@ -570,3 +571,27 @@ profile_count::operator*= (const sreal &num)
 {
   return *this * num;
 }
+
+/* Make counter forcibly nonzero.  */
+profile_count
+profile_count::force_nonzero () const
+{
+  if (!initialized_p ())
+    return *this;
+  profile_count ret = *this;
+  /* Generally values are forced non-zero to handle inconsistent profile 
+     where count 0 needs to be scaled up to non-zero.
+
+     Use cutoff value here to avoid situation where profile has large
+     cutoff and we perform count = count * num / den where num is non-zero
+     and den is 0.   If profile was scaled by large factor, forcing value
+     to 1 would lead to large scale factor.  */
+  gcov_unsigned_t small = profile_info ? profile_info->cutoff / 2 + 1
+                         : 1;
+  if (ret.m_val < small)
+    {
+      ret.m_val = small;
+      ret.m_quality = MIN (m_quality, ADJUSTED);
+    }
+  return ret;
+}
diff --git a/gcc/profile-count.h b/gcc/profile-count.h
index 216054033c52..20c03a292382 100644
--- a/gcc/profile-count.h
+++ b/gcc/profile-count.h
@@ -1112,18 +1112,7 @@ public:
     }
 
   /* Make counter forcibly nonzero.  */
-  profile_count force_nonzero () const
-    {
-      if (!initialized_p ())
-       return *this;
-      profile_count ret = *this;
-      if (ret.m_val == 0)
-       {
-         ret.m_val = 1;
-         ret.m_quality = MIN (m_quality, ADJUSTED);
-       }
-      return ret;
-    }
+  profile_count force_nonzero () const;
 
   profile_count max (profile_count other) const
     {
diff --git a/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c 
b/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c
index 43a909054b50..904dd0cfb28a 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c
@@ -31,4 +31,4 @@ int main()
 }
 /* We will have profiles for test2 and test2.constprop.0 that will have to be
    merged,  */
-/* { dg-final-use-autofdo { scan-ipa-dump "Merging duplicate symbol test2" 
"afdo_offline"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump "Merging duplicate instance: test2" 
"afdo_offline"} } */

Reply via email to