Add a testcase. Bootstrap and regression testing are OK for the patch in my last mail.
2013-09-09 Wei Mi <[email protected]>
* gcc/testsuite/gcc.dg/macro-fusion-1.c: New.
Index: gcc/testsuite/gcc.dg/macro-fusion-1.c
===================================================================
--- gcc/testsuite/gcc.dg/macro-fusion-1.c (revision 0)
+++ gcc/testsuite/gcc.dg/macro-fusion-1.c (revision 0)
@@ -0,0 +1,14 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -mtune=corei7 -fdump-rtl-sched2" } */
+/* { dg-final { scan-rtl-dump-not "compare.*insn.*jump_insn.*jump_insn" "sched2" } } */
+
+int a[100];
+
+double bar (double sum)
+{
+ int i;
+ for (i = 0; i < 1000000; i++)
+ sum += (0.5 + (a[i%100] - 128));
+ return sum;
+}
+
On Fri, Sep 6, 2013 at 10:39 AM, Wei Mi <[email protected]> wrote:
> SCHED_GROUP works once I call chain_to_prev_insn after
> add_branch_dependences, so that control dependences are chained to the
> prev insn of the sched group. Here is the new patch. Testing is ongoing.
>
> Thanks,
> Wei Mi.
>
> 2013-09-06 Wei Mi <[email protected]>
>
> * config/i386/i386.c (ix86_macro_fusion_p): New function.
> (ix86_macro_fusion_pair_p): Ditto.
> * config/i386/x86-tune.def (DEF_TUNE): Add m_COREI7 for
> X86_TUNE_FUSE_CMP_AND_BRANCH.
> * sched-deps.c (group_insns_for_macro_fusion): New function.
> (sched_analyze_insn): Call group_insns_for_macro_fusion.
> (chain_to_prev_insn): Change it from static to extern.
> (chain_to_prev_insn_p): Ditto.
> * doc/tm.texi: Generated.
> * doc/tm.texi.in: Ditto.
> * sched-int.h: New declarations.
> * sched-rgn.c (add_branch_dependences): Chain control
> dependences to prev insn for sched group.
> * target.def: Add macro_fusion_p and macro_fusion_pair_p.
>
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c (revision 201963)
> +++ config/i386/i386.c (working copy)
> @@ -24850,6 +24850,99 @@ ia32_multipass_dfa_lookahead (void)
> }
> }
>
> +/* Return true if target platform supports macro-fusion. */
> +
> +static bool
> +ix86_macro_fusion_p ()
> +{
> + if (TARGET_FUSE_CMP_AND_BRANCH)
> + return true;
> + else
> + return false;
> +}
> +
> +/* Check whether the current microarchitecture supports macro fusion
> + for the insn pair "CONDGEN + CONDJMP". Refer to the
> + "Intel 64 and IA-32 Architectures Optimization Reference Manual". */
> +
> +static bool
> +ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
> +{
> + rtx src;
> + if (!strcmp (ix86_tune_string, "corei7"))
> + {
> + /* For Nehalem. */
> + rtx single_set = single_set (condgen);
> + /* Nehalem doesn't support macro-fusion for add/sub+jmp. */
> + if (single_set == NULL_RTX)
> + return false;
> +
> + src = SET_SRC (single_set);
> + if (GET_CODE (src) != COMPARE)
> + return false;
> +
> + /* Nehalem doesn't support macro-fusion for cmp/test MEM-IMM
> + insn pattern. */
> + if ((MEM_P (XEXP (src, 0))
> + && CONST_INT_P (XEXP (src, 1)))
> + || (MEM_P (XEXP (src, 1))
> + && CONST_INT_P (XEXP (src, 0))))
> + return false;
> +
> + /* Nehalem doesn't support macro-fusion for add/sub/dec/inc + jmp. */
> + if (get_attr_type (condgen) != TYPE_TEST
> + && get_attr_type (condgen) != TYPE_ICMP)
> + return false;
> + return true;
> + }
> + else if (!strcmp (ix86_tune_string, "corei7-avx"))
> + {
> + /* For Sandybridge. */
> + enum rtx_code ccode;
> + rtx compare_set = NULL_RTX, test_if, cond;
> + rtx single_set = single_set (condgen);
> + if (single_set != NULL_RTX)
> + compare_set = single_set;
> + else
> + {
> + int i;
> + rtx pat = PATTERN (condgen);
> + for (i = 0; i < XVECLEN (pat, 0); i++)
> + if (GET_CODE (XVECEXP (pat, 0, i)) == SET
> + && GET_CODE (SET_SRC (XVECEXP (pat, 0, i))) == COMPARE)
> + compare_set = XVECEXP (pat, 0, i);
> + }
> +
> + if (compare_set == NULL_RTX)
> + return false;
> + src = SET_SRC (compare_set);
> + if (GET_CODE (src) != COMPARE)
> + return false;
> +
> + /* Sandybridge doesn't support macro-fusion for cmp/test MEM-IMM
> + insn pattern. */
> + if ((MEM_P (XEXP (src, 0))
> + && CONST_INT_P (XEXP (src, 1)))
> + || (MEM_P (XEXP (src, 1))
> + && CONST_INT_P (XEXP (src, 0))))
> + return false;
> +
> + /* Sandybridge doesn't support macro-fusion for inc/dec +
> + unsigned comparison jmp. */
> + test_if = SET_SRC (pc_set (condjmp));
> + cond = XEXP (test_if, 0);
> + ccode = GET_CODE (cond);
> + if (get_attr_type (condgen) == TYPE_INCDEC
> + && (ccode == GEU
> + || ccode == GTU
> + || ccode == LEU
> + || ccode == LTU))
> + return false;
> + return true;
> + }
> + return false;
> +}
> +
> /* Try to reorder ready list to take advantage of Atom pipelined IMUL
> execution. It is applied if
> (1) IMUL instruction is on the top of list;
> @@ -42982,6 +43075,10 @@ ix86_memmodel_check (unsigned HOST_WIDE_
> #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
> #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
> ia32_multipass_dfa_lookahead
> +#undef TARGET_SCHED_MACRO_FUSION_P
> +#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
> +#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
> +#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
>
> #undef TARGET_FUNCTION_OK_FOR_SIBCALL
> #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
> Index: config/i386/x86-tune.def
> ===================================================================
> --- config/i386/x86-tune.def (revision 201963)
> +++ config/i386/x86-tune.def (working copy)
> @@ -196,7 +196,8 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
> /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
> with a subsequent conditional jump instruction into a single
> compare-and-branch uop. */
> -DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
> +DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch",
> + m_COREI7 | m_BDVER)
> /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
> will impact LEA instruction selection. */
> DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
> Index: sched-deps.c
> ===================================================================
> --- sched-deps.c (revision 201963)
> +++ sched-deps.c (working copy)
> @@ -487,7 +487,6 @@ static void add_dependence_list (rtx, rt
> static void add_dependence_list_and_free (struct deps_desc *, rtx,
> rtx *, int, enum reg_note, bool);
> static void delete_all_dependences (rtx);
> -static void chain_to_prev_insn (rtx);
>
> static void flush_pending_lists (struct deps_desc *, rtx, int, int);
> static void sched_analyze_1 (struct deps_desc *, rtx, rtx);
> @@ -1660,7 +1659,7 @@ delete_all_dependences (rtx insn)
> chains backwards. Then we add the dependencies for the group to
> the previous nonnote insn. */
>
> -static void
> +void
> chain_to_prev_insn (rtx insn)
> {
> sd_iterator_def sd_it;
> @@ -2821,6 +2820,35 @@ sched_analyze_2 (struct deps_desc *deps,
> sched_deps_info->finish_rhs ();
> }
>
> +/* If the last cond jump and the cond register defining insn are consecutive
> + before scheduling, we want them to be in a schedule group. This is good
> + for performance on microarchitectures supporting macro-fusion. */
> +
> +static void
> +group_insns_for_macro_fusion (rtx insn)
> +{
> + unsigned int condreg1, condreg2;
> + rtx cc_reg_1;
> + rtx prev;
> +
> + targetm.fixed_condition_code_regs (&condreg1, &condreg2);
> + cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
> + prev = prev_nonnote_nondebug_insn (insn);
> + if (!any_condjump_p (insn)
> + || !reg_referenced_p (cc_reg_1, PATTERN (insn))
> + || !prev
> + || !modified_in_p (cc_reg_1, prev))
> + return;
> +
> + /* Different microarchitectures support macro fusions for different
> + combinations of insn pairs. */
> + if (!targetm.sched.macro_fusion_pair_p
> + || !targetm.sched.macro_fusion_pair_p (prev, insn))
> + return;
> +
> + SCHED_GROUP_P (insn) = 1;
> +}
> +
> /* Analyze an INSN with pattern X to find all dependencies. */
> static void
> sched_analyze_insn (struct deps_desc *deps, rtx x, rtx insn)
> @@ -2844,6 +2872,10 @@ sched_analyze_insn (struct deps_desc *de
> can_start_lhs_rhs_p = (NONJUMP_INSN_P (insn)
> && code == SET);
>
> + if (targetm.sched.macro_fusion_p
> + && targetm.sched.macro_fusion_p ())
> + group_insns_for_macro_fusion (insn);
> +
> if (may_trap_p (x))
> /* Avoid moving trapping instructions across function calls that might
> not always return. */
> @@ -3504,7 +3536,7 @@ call_may_noreturn_p (rtx insn)
> group, and if all INSN's dependencies should be moved to the first
> instruction of that group. */
>
> -static bool
> +bool
> chain_to_prev_insn_p (rtx insn)
> {
> rtx prev, x;
> Index: doc/tm.texi
> ===================================================================
> --- doc/tm.texi (revision 201963)
> +++ doc/tm.texi (working copy)
> @@ -6553,6 +6553,17 @@ scheduling one insn causes other insns t
> cycle. These other insns can then be taken into account properly.
> @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void)
> +This hook is used to check whether target platform supports macro fusion.
> +@end deftypefn
> +
> +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx @var{condgen}, rtx @var{condjmp})
> +This hook is used to check whether two insns could be macro fused for
> +target microarchitecture. If this hook returns true for the given insn pair
> +(@var{condgen} and @var{condjmp}), scheduler will put them into a sched
> +group, and they will not be scheduled apart.
> +@end deftypefn
> +
> @deftypefn {Target Hook} void TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx @var{tail})
> This hook is called after evaluation forward dependencies of insns in
> chain given by two parameter values (@var{head} and @var{tail}
> Index: doc/tm.texi.in
> ===================================================================
> --- doc/tm.texi.in (revision 201963)
> +++ doc/tm.texi.in (working copy)
> @@ -4940,6 +4940,10 @@ them: try the first ones in this list fi
>
> @hook TARGET_SCHED_REORDER2
>
> +@hook TARGET_SCHED_MACRO_FUSION_P
> +
> +@hook TARGET_SCHED_MACRO_FUSION_PAIR_P
> +
> @hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
>
> @hook TARGET_SCHED_INIT
> Index: sched-int.h
> ===================================================================
> --- sched-int.h (revision 201963)
> +++ sched-int.h (working copy)
> @@ -1302,6 +1302,8 @@ extern void finish_deps_global (void);
> extern void deps_analyze_insn (struct deps_desc *, rtx);
> extern void remove_from_deps (struct deps_desc *, rtx);
> extern void init_insn_reg_pressure_info (rtx);
> +extern bool chain_to_prev_insn_p (rtx insn);
> +extern void chain_to_prev_insn (rtx);
>
> extern dw_t get_dep_weak (ds_t, ds_t);
> extern ds_t set_dep_weak (ds_t, ds_t, dw_t);
> Index: sched-rgn.c
> ===================================================================
> --- sched-rgn.c (revision 201963)
> +++ sched-rgn.c (working copy)
> @@ -2507,7 +2507,7 @@ add_branch_dependences (rtx head, rtx ta
> }
>
> if (!targetm.have_conditional_execution ())
> - return;
> + goto chain_to_prev_insn;
>
> /* Finally, if the block ends in a jump, and we are doing intra-block
> scheduling, make sure that the branch depends on any COND_EXEC insns
> @@ -2543,7 +2543,7 @@ add_branch_dependences (rtx head, rtx ta
> could remove always-true predicates. */
>
> if (!reload_completed || ! (JUMP_P (tail) || JUMP_TABLE_DATA_P (tail)))
> - return;
> + goto chain_to_prev_insn;
>
> insn = tail;
> while (insn != head)
> @@ -2557,6 +2557,23 @@ add_branch_dependences (rtx head, rtx ta
> if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == COND_EXEC)
> add_dependence (tail, insn, REG_DEP_ANTI);
> }
> +
> + chain_to_prev_insn:
> + /* Control dependences also need to be chained to the prev insn
> + for sched group. */
> + insn = tail;
> + while (insn != head)
> + {
> + /* Fixup the dependencies in the sched group. */
> + if (JUMP_P (insn)
> + && chain_to_prev_insn_p (insn)
> + && !sel_sched_p ())
> + chain_to_prev_insn (insn);
> +
> + insn = PREV_INSN (insn);
> + }
> +
> + return;
> }
>
> /* Data structures for the computation of data dependences in a regions. We
> Index: target.def
> ===================================================================
> --- target.def (revision 201963)
> +++ target.def (working copy)
> @@ -1041,6 +1041,19 @@ scheduling one insn causes other insns t
> cycle. These other insns can then be taken into account properly.",
> int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL)
>
> +DEFHOOK
> +(macro_fusion_p,
> + "This hook is used to check whether target platform supports macro fusion.",
> + bool, (void), NULL)
> +
> +DEFHOOK
> +(macro_fusion_pair_p,
> + "This hook is used to check whether two insns could be macro fused for\n\
> +target microarchitecture. If this hook returns true for the given insn pair\n\
> +(@var{condgen} and @var{condjmp}), scheduler will put them into a sched\n\
> +group, and they will not be scheduled apart.",
> + bool, (rtx condgen, rtx condjmp), NULL)
> +
> /* The following member value is a pointer to a function called
> after evaluation forward dependencies of insns in chain given
> by two parameter values (head and tail correspondingly). */