Hi Honza,
The attached patch is split from the original patch and introduces new pass
ipa_adjust_tp_first_run for setting tp_first_run for clones that are
materialized after auto-profile pass.
I tried using function_summary instead of custom std::map<const char *, int,
string_compare> clone_timestamp_map, but am running into following issue:
AFAIU, function_summary works as a map from cgraph_node to the custom pass
summary. But in this case, we want summary <name, tp_first_run> for node's
profiled clones (and not node itself),
which would later get materialized during WPA and do not exist during
generate_summary stage of the pass.
I suppose we could maintain per each node summaries of all it's profiled clones
(name, tp_first_run) during generate_summary stage, and during WPA stage of
pass, fetch the correct clone from it's name and set it's tp_first_run ?
Altho if the original node gets removed from symtab during IPA opts, I am not
sure if we could access it's clones info from function_summary in the pass ?
In the attached patch, I have left clone_timestamp_map as-is.
Could you please suggest how to proceed ?
On average with this patch, I am seeing a ~2.5% uplift on top of base patch
(and the explicit partitioner based on tp_first_run).
Signed-off-by: Prathamesh Kulkarni <[email protected]>
Thanks,
Prathamesh
Add new pass ipa_adjust_tp_first_run.
The patch introduces a late IPA pass for assigning tp_first_run values to
clones with AutoFDO. During auto-profile pass, for clones that aren't
registered in call graph, stream <name, tp_first_run> from timestamp_info_map
during LGEN. And during WPA, stream back the info, and assign the field value.
The pass is placed sufficiently late (just before locality_cloning), that it
should have clones registered in the call graph.
gcc/ChangeLog:
* auto-profile.cc: Include lto-streamer.h and data-streamer.h.
(timestamp_info_map): Change value to std::pair<int, int> instead of
int.
(autofdo_source_profile::read): Call std::make_pair for populating
value field of timestamp_info_map.
(afdo_annotate_cfg): Use it->second.second instead of it->second for
assigning value to node->tp_first_run.
(clone_function_p): New function.
(clone_timestamp_map): New std::map for storing clone's name and
tp_first_run which are materialized after auto-profile pass.
(adjust_tp_first_run_generate_summary): Likewise.
(adjust_tp_first_run_write_summary): Likewise.
(adjust_tp_first_run_read_summary): Likewise.
(adjust_tp_first_run): Likewise.
(class pass_ipa_adjust_tp_first_run): New pass.
(make_pass_ipa_adjust_tp_first_run): New function.
* lto-section-in.cc: Add entry "ipa_adjust_tp_first_run".
* lto-streamer.h (enum lto_section_type): Add entry for
LTO_section_ipa_adjust_tp_first_run.
* passes.def: Add entry for pass_ipa_adjust_tp_first_run.
* tree-pass.h (make_pass_ipa_adjust_tp_first_run): Declare.
Signed-off-by: Prathamesh Kulkarni <[email protected]>
diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 2a55e4ffbaf..ed0c9f1b90d 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -54,6 +54,8 @@ along with GCC; see the file COPYING3. If not see
#include "tree-pretty-print.h"
#include "gimple-pretty-print.h"
#include "output.h"
+#include "lto-streamer.h"
+#include "data-streamer.h"
/* The following routines implement AutoFDO optimization.
@@ -707,7 +709,7 @@ static gcov_summary *afdo_profile_info;
The purpose of this map is to map 64-bit timestamp values to (1..N) sorted
by ascending order of timestamps and assign that to node->tp_first_run,
since we don't need the full 64-bit range. */
-static std::map<gcov_type, int> timestamp_info_map;
+static std::map<gcov_type, std::pair<int, int>> timestamp_info_map;
/* Scaling factor for afdo data. Compared to normal profile
AFDO profile counts are much lower, depending on sampling
@@ -2969,7 +2971,8 @@ autofdo_source_profile::read ()
afdo_string_table->get_symbol_name (s->symbol_name ()));
s->prop_timestamp ();
- timestamp_info_map.insert({s->timestamp (), 0});
+ timestamp_info_map.insert({s->timestamp (),
+ std::make_pair (s->symbol_name (), 0)});
}
/* timestamp_info_map is std::map with timestamp as key,
@@ -2980,7 +2983,7 @@ autofdo_source_profile::read ()
int tp_first_run = 1;
for (auto &p : timestamp_info_map)
- p.second = tp_first_run++;
+ p.second.second = tp_first_run++;
int hot_frac = param_hot_bb_count_fraction;
/* Scale up the profile, but leave some bits in case some counts gets
@@ -4385,7 +4388,7 @@ afdo_annotate_cfg (void)
it != timestamp_info_map.end ())
{
cgraph_node *node = cgraph_node::get (current_function_decl);
- node->tp_first_run = it->second;
+ node->tp_first_run = it->second.second;
if (dump_file)
fprintf (dump_file, "Setting %s->tp_first_run to %d\n",
@@ -4805,3 +4808,182 @@ make_pass_ipa_auto_profile_offline (gcc::context *ctxt)
{
return new pass_ipa_auto_profile_offline (ctxt);
}
+
+/* Map from name -> tp_first_run for clones. We need this for initiailizing
tp_first_run
+ of clones created after auto-profile pass. */
+
+static std::map<const char *, int, autofdo::string_compare>
clone_timestamp_map;
+
+/* Check if name is a clone function name. LTO privatized clones should already
+ be handled with sourcefile tracking. */
+
+static bool
+clone_function_p (const char *name)
+{
+ return strchr (name, '.') && !strstr (name, "lto_priv");
+}
+
+/* Gather <name, tp_first_run> value from autofdo::timestamp_info_map. */
+
+static void
+adjust_tp_first_run_generate_summary (void)
+{
+ for (const auto &p : autofdo::timestamp_info_map)
+ {
+ gcov_type timestamp = p.first;
+ int name_id = p.second.first;
+ int tp_first_run = p.second.second;
+
+ if (const char *name
+ = autofdo::afdo_string_table->get_symbol_name (name_id);
+ name && clone_function_p (name))
+ clone_timestamp_map.insert({xstrdup (name), tp_first_run});
+ }
+}
+
+/* Stream out <name, tp_first_run>. */
+
+static void
+adjust_tp_first_run_write_summary (void)
+{
+ struct output_block *ob
+ = create_output_block (LTO_section_ipa_adjust_tp_first_run);
+
+ streamer_write_uhwi (ob, clone_timestamp_map.size ());
+ for (const auto &p : clone_timestamp_map)
+ {
+ streamer_write_string (ob, ob->main_stream, p.first, true);
+ streamer_write_hwi (ob, p.second);
+ }
+
+ produce_asm (ob);
+ destroy_output_block (ob);
+}
+
+/* Stream in <name, tp_first_run> and put it in clone_timestamp_map. */
+
+static void
+adjust_tp_first_run_read_summary (void)
+{
+ struct lto_file_decl_data **file_data_vec = lto_get_file_decl_data ();
+ struct lto_file_decl_data *file_data;
+ unsigned j = 0;
+
+ while ((file_data = file_data_vec[j++]))
+ {
+ size_t len;
+ const char *data
+ = lto_get_summary_section_data (file_data,
+ LTO_section_ipa_adjust_tp_first_run,
+ &len);
+ if (!data)
+ continue;
+
+ const struct lto_function_header *header
+ = (const struct lto_function_header *) data;
+ const int cfg_offset = sizeof (struct lto_function_header);
+ const int main_offset = cfg_offset + header->cfg_size;
+ const int string_offset = main_offset + header->main_size;
+ struct data_in *data_in;
+
+ lto_input_block ib ((const char *) data + main_offset,
+ header->main_size, file_data);
+
+ data_in
+ = lto_data_in_create (file_data, (const char *) data + string_offset,
+ header->string_size, vNULL);
+
+ unsigned num_clones = streamer_read_uhwi (&ib);
+ for (unsigned i = 0; i < num_clones; i++)
+ {
+ const char *fn_name = streamer_read_string (data_in, &ib);
+ int tp_first_run = streamer_read_hwi (&ib);
+ clone_timestamp_map.insert ({xstrdup (fn_name), tp_first_run});
+ }
+
+ lto_free_section_data (file_data, LTO_section_ipa_adjust_tp_first_run,
+ NULL, data, len);
+ lto_data_in_delete (data_in);
+ }
+}
+
+/* A late IPA pass for handling clones.
+ During auto-profile pass, for clones that aren't registered in call graph,
+ stream <name, tp_first_run> from timestamp_info_map during LGEN.
+ And during WPA, stream back the info, and assign the field value.
+ The pass is placed sufficiently late (just before locality_cloning), that
+ it should have clones registered in the call graph. */
+
+static unsigned
+adjust_tp_first_run ()
+{
+ for (const auto &p : clone_timestamp_map)
+ if (cgraph_node *node
+ = cgraph_node::get_for_asmname (get_identifier (p.first)))
+ {
+ node->tp_first_run = p.second;
+ if (dump_file)
+ fprintf (dump_file, "Setting %s->tp_first_run to %d\n",
+ node->asm_name (), node->tp_first_run);
+ }
+
+ for (const auto &p : clone_timestamp_map)
+ free ((void *) p.first);
+ clone_timestamp_map.clear ();
+
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_ipa_adjust_tp_first_run =
+{
+ IPA_PASS, /* type */
+ "adjust-tp_first_run", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_CGRAPHOPT, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_ipa_adjust_tp_first_run : public ipa_opt_pass_d
+{
+public:
+ pass_ipa_adjust_tp_first_run (gcc::context *ctxt)
+ : ipa_opt_pass_d (pass_data_ipa_adjust_tp_first_run, ctxt,
+ adjust_tp_first_run_generate_summary, /* generate_summary
*/
+ adjust_tp_first_run_write_summary, /* write_summary */
+ adjust_tp_first_run_read_summary, /* read_summary */
+ NULL, /* write_optimization_summary */
+ NULL, /* read_optimization_summary */
+ NULL, /* stmt_fixup */
+ 0, /* function_transform_todo_flags_start */
+ NULL, /* function_transform */
+ NULL) /* variable_transform */
+ {}
+
+ /* opt_pass methods: */
+ bool
+ gate (function *) final override
+ {
+ return flag_auto_profile;
+ }
+
+ unsigned
+ execute (function *) final override
+ {
+ return adjust_tp_first_run ();
+ }
+
+}; // class pass_ipa_adjust_tp_first_run
+
+} // anon namespace
+
+ipa_opt_pass_d *
+make_pass_ipa_adjust_tp_first_run (gcc::context *ctxt)
+{
+ return new pass_ipa_adjust_tp_first_run (ctxt);
+}
diff --git a/gcc/lto-section-in.cc b/gcc/lto-section-in.cc
index bf8621925dc..79e34f29d41 100644
--- a/gcc/lto-section-in.cc
+++ b/gcc/lto-section-in.cc
@@ -57,6 +57,7 @@ const char *lto_section_name[LTO_N_SECTION_TYPES] =
"ipa_sra",
"odr_types",
"ipa_modref",
+ "ipa_adjust_tp_first_run",
};
/* Hooks so that the ipa passes can call into the lto front end to get
diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
index c7c6c8fbf06..90fab7fca48 100644
--- a/gcc/lto-streamer.h
+++ b/gcc/lto-streamer.h
@@ -225,6 +225,7 @@ enum lto_section_type
LTO_section_ipa_sra,
LTO_section_odr_types,
LTO_section_ipa_modref,
+ LTO_section_ipa_adjust_tp_first_run,
LTO_N_SECTION_TYPES /* Must be last. */
};
diff --git a/gcc/passes.def b/gcc/passes.def
index fac04cd86c7..fae433befc2 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -171,6 +171,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_ipa_sra);
NEXT_PASS (pass_ipa_fn_summary);
NEXT_PASS (pass_ipa_inline);
+ NEXT_PASS (pass_ipa_adjust_tp_first_run);
NEXT_PASS (pass_ipa_locality_cloning);
NEXT_PASS (pass_ipa_pure_const);
NEXT_PASS (pass_ipa_modref);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 410341d4711..55909683b23 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -552,6 +552,7 @@ extern ipa_opt_pass_d *make_pass_ipa_cdtor_merge
(gcc::context *ctxt);
extern ipa_opt_pass_d *make_pass_ipa_single_use (gcc::context *ctxt);
extern ipa_opt_pass_d *make_pass_ipa_comdats (gcc::context *ctxt);
extern ipa_opt_pass_d *make_pass_ipa_modref (gcc::context *ctxt);
+extern ipa_opt_pass_d *make_pass_ipa_adjust_tp_first_run (gcc::context *ctxt);
extern ipa_opt_pass_d *make_pass_ipa_locality_cloning (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_cleanup_cfg_post_optimizing (gcc::context