https://gcc.gnu.org/g:4237e952fd1307c6aa4975f446ef8d938f6e8d31
commit 4237e952fd1307c6aa4975f446ef8d938f6e8d31
Author: Artemiy Volkov <[email protected]>
Date: Sat Sep 6 15:06:36 2025 -0600

gcc: introduce the dep_fusion pass

Presently, the scheduler code only considers consecutive instructions
for macro-op fusion (see sched-deps.cc::sched_macro_fuse_insns () for
details). This patch introduces the new dep_fusion pass, which is
intended to uncover more fusion opportunities by reordering eligible
instructions to form fusible pairs (based solely on the value of the
TARGET_SCHED_MACRO_FUSION_PAIR_P hook). This is achieved by using the
RTL-SSA framework, and only single-use instructions are considered as
the first instruction of a pair.

Aside from reordering instructions, this pass also sets the SCHED_GROUP
flag for the second instruction so that following passes can implement
special handling of the fused pairs. For instance, RA and regrename
should make use of this information to preserve the single-output
property for some such pairs. Accordingly, in passes.def, this patch
adds two invocations of the new pass: just before IRA and just before
regrename.

The new pass is enabled at -O2+ and -Os.

gcc/ChangeLog:

* Makefile.in (OBJS): Add dep-fusion.o.
* common.opt (fdep-fusion): Add option.
* dep-fusion.cc: New pass.
* doc/invoke.texi: Document it.
* opts.cc (default_options_table): Enable it at -O2+ and -Os.
* passes.def: Insert two instances of dep_fusion.
* tree-pass.h (make_pass_dep_fusion): Declare new function.

(cherry picked from commit 41b0c7a674e87074fdc8088479cb93f6fe1e070f) Diff: --- gcc/Makefile.in | 1 + gcc/common.opt | 4 ++ gcc/dep-fusion.cc | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/doc/invoke.texi | 15 ++++-- gcc/opts.cc | 1 + gcc/passes.def | 2 + gcc/tree-pass.h | 1 + 7 files changed, 169 insertions(+), 3 deletions(-) diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 01aa97722a51..04822e4429ad 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1447,6 +1447,7 @@ OBJS = \ dce.o \ ddg.o \ debug.o \ + dep-fusion.o \ df-core.o \ df-problems.o \ df-scan.o \ diff --git a/gcc/common.opt b/gcc/common.opt index e3fa0dacec4c..81e4126e7d90 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1362,6 +1362,10 @@ fdelete-null-pointer-checks Common Var(flag_delete_null_pointer_checks) Init(-1) Optimization Delete useless null pointer checks. +fdep-fusion +Common Var(flag_dep_fusion) Optimization Init(1) +Issue defining instructions back to back with their single uses, provided they are macro-fusible in the target microarchitecture. + fdevirtualize-at-ltrans Common Var(flag_ltrans_devirtualize) Stream extra data to support more aggressive devirtualization in LTO local transformation mode. diff --git a/gcc/dep-fusion.cc b/gcc/dep-fusion.cc new file mode 100644 index 000000000000..1e69e68dd876 --- /dev/null +++ b/gcc/dep-fusion.cc @@ -0,0 +1,148 @@ +// Dependency fusion reordering pass. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This file is part of GCC. +// +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. +// +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. +// +// This pass uses the RTL-SSA representation to detect def-use pairs that are +// macro-op-fusible in the current microarchitecture (using the +// macro_fusion_pair_p () target hook) and place them next to one another, if +// possible. + +#define INCLUDE_ALGORITHM +#define INCLUDE_FUNCTIONAL +#define INCLUDE_MEMORY +#define INCLUDE_ARRAY +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "df.h" +#include "rtl-ssa.h" +#include "print-rtl.h" +#include "tree-pass.h" +#include "cfgcleanup.h" +#include "target.h" +#include "dbgcnt.h" + +namespace { +const pass_data pass_data_dep_fusion = +{ + RTL_PASS, // type + "dep_fusion", // name + OPTGROUP_NONE, // optinfo_flags + TV_NONE, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish, // todo_flags_finish +}; + +class pass_dep_fusion : public rtl_opt_pass +{ +public: + pass_dep_fusion (gcc::context *ctxt) + : rtl_opt_pass (pass_data_dep_fusion, ctxt) + {} + + // opt_pass methods: + opt_pass *clone () override { return new pass_dep_fusion (m_ctxt); } + bool gate (function *) override; + unsigned int execute (function *) override; +}; + +bool +pass_dep_fusion::gate (function *) +{ + return optimize > 0 && flag_dep_fusion; +} + +unsigned int +pass_dep_fusion::execute (function *fn) +{ + // Initialization. 
+ calculate_dominance_info (CDI_DOMINATORS); + df_analyze (); + crtl->ssa = new rtl_ssa::function_info (fn); + + init_recog_no_volatile (); + + for (rtl_ssa::insn_info *insn = *crtl->ssa->nondebug_insns ().begin (); + insn; + insn = insn->next_nondebug_insn ()) + { + if (!insn->can_be_optimized () || insn->num_defs () != 1) + continue; + + rtl_ssa::set_info *def = single_set_info (insn); + if (!def) + continue; + + rtl_ssa::use_info *use_insn = def->single_nondebug_insn_use (); + if (!use_insn + || !use_insn->insn ()->can_be_optimized () + || !targetm.sched.macro_fusion_pair_p (insn->rtl (), + use_insn->insn ()->rtl ())) + continue; + + auto attempt = crtl->ssa->new_change_attempt (); + rtl_ssa::insn_change change (use_insn->insn ()); + + if (use_insn->insn () != insn->next_any_insn ()) + { + if (!can_move_insn_p (use_insn->insn ())) + continue; + + change.move_range = insn; + if (!rtl_ssa::restrict_movement (change)) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Moved a single-use instruction:\n"); + dump_insn_slim (dump_file, use_insn->insn ()->rtl ()); + fprintf (dump_file, "right after its definition:\n"); + dump_insn_slim (dump_file, insn->rtl ()); + } + } + + SCHED_GROUP_P (use_insn->insn ()->rtl ()) = 1; + confirm_change_group (); + crtl->ssa->change_insn (change); + } + + // Finalization. + if (crtl->ssa->perform_pending_updates ()) + cleanup_cfg (0); + + delete crtl->ssa; + + init_recog (); + free_dominance_info (CDI_DOMINATORS); + return 0; +} + +} // end namespace + +// Create a new dep fusion pass instance. + +rtl_opt_pass * +make_pass_dep_fusion (gcc::context *ctxt) +{ + return new pass_dep_fusion (ctxt); +} diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 164c222a61b5..03a66a46574c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -580,8 +580,8 @@ Objective-C and Objective-C++ Dialects}. 
-fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules -fcx-limited-range -fdata-sections -fdce -fdelayed-branch --fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively --fdevirtualize-at-ltrans -fdse +-fdelete-null-pointer-checks -fdep-fusion -fdevirtualize +-fdevirtualize-speculatively -fdevirtualize-at-ltrans -fdse -fearly-inlining -fipa-sra -fexpensive-optimizations -ffat-lto-objects -ffast-math -ffinite-math-only -ffloat-store -fexcess-precision=@var{style} -ffinite-loops @@ -12794,7 +12794,7 @@ also turns on the following optimization flags: -fcode-hoisting -fcrossjumping -fcse-follow-jumps -fcse-skip-blocks --fdelete-null-pointer-checks +-fdelete-null-pointer-checks -fdep-fusion -fdevirtualize -fdevirtualize-speculatively -fexpensive-optimizations -ffinite-loops @@ -15698,6 +15698,15 @@ more efficiently if they are adjacent to each other in the instruction flow. Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. +@opindex fdep-fusion +@item -fdep-fusion +Detect macro-op fusible pairs consisting of single-use instructions and their +uses, and place such pairs together in the instruction stream to increase +fusion opportunities in hardware. This pass is executed once before register +allocation, and another time before register renaming. + +Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. + @opindex ftracer @item -ftracer Perform tail duplication to enlarge superblock size. 
This transformation diff --git a/gcc/opts.cc b/gcc/opts.cc index ffcbdfef0bd9..74e4c9fa5ca2 100644 --- a/gcc/opts.cc +++ b/gcc/opts.cc @@ -631,6 +631,7 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_2_PLUS, OPT_fcode_hoisting, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fcrossjumping, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fcse_follow_jumps, NULL, 1 }, + { OPT_LEVELS_2_PLUS, OPT_fdep_fusion, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fdevirtualize, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fdevirtualize_speculatively, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fexpensive_optimizations, NULL, 1 }, diff --git a/gcc/passes.def b/gcc/passes.def index dc4b3b72bd8f..47a07845d8d5 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -508,6 +508,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_sched); NEXT_PASS (pass_rtl_avoid_store_forwarding); NEXT_PASS (pass_early_remat); + NEXT_PASS (pass_dep_fusion); NEXT_PASS (pass_ira); NEXT_PASS (pass_reload); /* In the following, some passes are tied to 'pass_postreload' and others @@ -529,6 +530,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_sched_fusion); NEXT_PASS (pass_peephole2); NEXT_PASS (pass_if_after_reload); + NEXT_PASS (pass_dep_fusion); NEXT_PASS (pass_regrename); NEXT_PASS (pass_fold_mem_offsets); NEXT_PASS (pass_cprop_hardreg); diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 7cb5a128899a..1ae8f623f9cf 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -623,6 +623,7 @@ extern rtl_opt_pass *make_pass_value_profile_transformations (gcc::context *ctxt); extern rtl_opt_pass *make_pass_postreload_cse (gcc::context *ctxt); extern rtl_opt_pass *make_pass_late_combine (gcc::context *ctxt); +extern rtl_opt_pass *make_pass_dep_fusion (gcc::context *ctxt); extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt); extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
