Hi, just for those who are interested, this is a quick-and-dirty patch adding another round of local optimization passes at WPA time. I've added the early inliner and IPA-SRA because I was curious how many optimization opportunities we are missing by limiting those to the early pass.
With early inlining it seems to be very little: we inline one extra call when building Mozilla in LTO mode. IPA-SRA is a different story. While we do 579 IPA-SRA clones in the early pass, the late pass produces 13014 clones (22 times more ;) suggesting that the pass might be interesting at the IPA level after all. There are 78686 functions after inlining in Mozilla, so one out of 7 functions is touched. The size difference of libxul is not great, about a 100Kb reduction. I will try benchmarking it eventually, too. Honza Index: cgraph.c =================================================================== *** cgraph.c (revision 175350) --- cgraph.c (working copy) *************** cgraph_release_function_body (struct cgr *** 1389,1396 **** } if (cfun->cfg) { ! gcc_assert (dom_computed[0] == DOM_NONE); ! gcc_assert (dom_computed[1] == DOM_NONE); clear_edges (); } if (cfun->value_histograms) --- 1393,1403 ---- } if (cfun->cfg) { ! /*gcc_assert (dom_computed[0] == DOM_NONE); ! gcc_assert (dom_computed[1] == DOM_NONE);*/ ! free_dominance_info (CDI_DOMINATORS); ! free_dominance_info (CDI_POST_DOMINATORS); ! clear_edges (); } if (cfun->value_histograms) Index: tree-pass.h =================================================================== *** tree-pass.h (revision 175350) --- tree-pass.h (working copy) *************** extern struct simple_ipa_opt_pass pass_i *** 452,458 **** extern struct simple_ipa_opt_pass pass_ipa_function_and_variable_visibility; extern struct simple_ipa_opt_pass pass_ipa_tree_profile; ! extern struct simple_ipa_opt_pass pass_early_local_passes; extern struct ipa_opt_pass_d pass_ipa_whole_program_visibility; extern struct ipa_opt_pass_d pass_ipa_lto_gimple_out; --- 452,458 ---- extern struct simple_ipa_opt_pass pass_ipa_function_and_variable_visibility; extern struct simple_ipa_opt_pass pass_ipa_tree_profile; ! 
extern struct simple_ipa_opt_pass pass_early_local_passes, pass_late_local_passes, pass_late_local_passes2; extern struct ipa_opt_pass_d pass_ipa_whole_program_visibility; extern struct ipa_opt_pass_d pass_ipa_lto_gimple_out; Index: ipa-inline-analysis.c =================================================================== *** ipa-inline-analysis.c (revision 175350) --- ipa-inline-analysis.c (working copy) *************** estimate_function_body_sizes (struct cgr *** 1535,1542 **** edge->call_stmt_cannot_inline_p = true; gimple_call_set_cannot_inline (stmt, true); } ! else ! gcc_assert (!gimple_call_cannot_inline_p (stmt)); } /* TODO: When conditional jump or swithc is known to be constant, but --- 1535,1542 ---- edge->call_stmt_cannot_inline_p = true; gimple_call_set_cannot_inline (stmt, true); } ! /*else ! gcc_assert (!gimple_call_cannot_inline_p (stmt));*/ } /* TODO: When conditional jump or swithc is known to be constant, but Index: tree-inline.c =================================================================== *** tree-inline.c (revision 175350) --- tree-inline.c (working copy) *************** expand_call_inline (basic_block bb, gimp *** 3891,3897 **** id->src_cfun = DECL_STRUCT_FUNCTION (fn); id->gimple_call = stmt; ! gcc_assert (!id->src_cfun->after_inlining); id->entry_bb = bb; if (lookup_attribute ("cold", DECL_ATTRIBUTES (fn))) --- 3891,3897 ---- id->src_cfun = DECL_STRUCT_FUNCTION (fn); id->gimple_call = stmt; ! /*gcc_assert (!id->src_cfun->after_inlining);*/ id->entry_bb = bb; if (lookup_attribute ("cold", DECL_ATTRIBUTES (fn))) Index: tree-optimize.c =================================================================== *** tree-optimize.c (revision 175350) --- tree-optimize.c (working copy) *************** struct simple_ipa_opt_pass pass_early_lo *** 123,128 **** --- 123,189 ---- /* Gate: execute, or not, all of the non-trivial optimizations. */ static bool + gate_all_late_local_passes (void) + { + /* Don't bother doing anything if the program has errors. 
*/ + return (!seen_error () && optimize); + } + + static unsigned int + execute_all_late_local_passes (void) + { + /* Once this pass (and its sub-passes) are complete, all functions + will be in SSA form. Technically this state change is happening + a tad late, since the sub-passes have not yet run, but since + none of the sub-passes are IPA passes and do not create new + functions, this is ok. We're setting this value for the benefit + of IPA passes that follow. */ + if (cgraph_state < CGRAPH_STATE_IPA_SSA) + cgraph_state = CGRAPH_STATE_IPA_SSA; + return 0; + } + + struct simple_ipa_opt_pass pass_late_local_passes = + { + { + SIMPLE_IPA_PASS, + "late_local_cleanups", /* name */ + gate_all_late_local_passes, /* gate */ + execute_all_late_local_passes, /* execute */ + NULL, /* sub */ + NULL, /* next */ + 0, /* static_pass_number */ + TV_EARLY_LOCAL, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_remove_functions /* todo_flags_finish */ + } + }; + + struct simple_ipa_opt_pass pass_late_local_passes2 = + { + { + SIMPLE_IPA_PASS, + "late_local_cleanups2", /* name */ + gate_all_late_local_passes, /* gate */ + execute_all_late_local_passes, /* execute */ + NULL, /* sub */ + NULL, /* next */ + 0, /* static_pass_number */ + TV_EARLY_LOCAL, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_remove_functions /* todo_flags_finish */ + } + }; + + /* Gate: execute, or not, all of the non-trivial optimizations. */ + + static bool gate_all_early_optimizations (void) { return (optimize >= 1 Index: passes.c =================================================================== *** passes.c (revision 175350) --- passes.c (working copy) *************** init_optimization_passes (void) *** 1263,1268 **** --- 1263,1288 ---- passes are executed after partitioning and thus see just parts of the compiled unit. 
*/ p = &all_late_ipa_passes; + NEXT_PASS (pass_late_local_passes); + { + struct opt_pass **p = &pass_late_local_passes.pass.sub; + NEXT_PASS (pass_inline_parameters); + NEXT_PASS (pass_release_ssa_names); + } + NEXT_PASS (pass_late_local_passes2); + { + struct opt_pass **p = &pass_late_local_passes2.pass.sub; + NEXT_PASS (pass_early_inline); + NEXT_PASS (pass_remove_cgraph_callee_edges); + NEXT_PASS (pass_ccp); + NEXT_PASS (pass_forwprop); + NEXT_PASS (pass_fre); + NEXT_PASS (pass_cd_dce); + NEXT_PASS (pass_early_ipa_sra); + NEXT_PASS (pass_release_ssa_names); + NEXT_PASS (pass_rebuild_cgraph_edges); + NEXT_PASS (pass_inline_parameters); + } NEXT_PASS (pass_ipa_pta); *p = NULL; /* These passes are run after IPA passes on every function that is being Index: statistics.c =================================================================== *** statistics.c (revision 175350) --- statistics.c (working copy) *************** statistics_fini_pass_3 (void **slot, voi *** 171,176 **** --- 171,178 ---- void statistics_fini_pass (void) { + if (!current_pass) + return; if (current_pass->static_pass_number == -1) return;