Re: [PATCH, rs6000 V2] Add store fusion support for Power10
Hi! On Wed, Aug 11, 2021 at 11:02:25AM -0500, Pat Haugen wrote: > * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag. > (POWERPC_MASKS): Likewise. "Add OPTION_MASK_P10_FUSION_2STORE", instead. > +static bool > +is_fusable_store (rtx_insn *insn, rtx *str_mem) > +{ > + /* Insn must be a non-prefixed base+disp form store. */ > + if (is_store_insn (insn, str_mem) > + && get_attr_prefixed (insn) == PREFIXED_NO > + && get_attr_update (insn) == UPDATE_NO > + && get_attr_indexed (insn) == INDEXED_NO) > +{ > + /* Further restictions by mode and size. */ (typo, "restrictions") > + machine_mode mode = GET_MODE (*str_mem); > + HOST_WIDE_INT size; > + if (MEM_SIZE_KNOWN_P (*str_mem)) > + size = MEM_SIZE (*str_mem); > + else > + return false; if (!MEM_SIZE_KNOWN_P (*str_mem)) return false; machine_mode mode = GET_MODE (*str_mem); HOST_WIDE_INT size = MEM_SIZE (*str_mem); > +static int > +power10_sched_reorder (rtx_insn **ready, int lastpos) > +{ > + rtx mem1; > + > + /* Do store fusion during sched2 only. */ > + if (!reload_completed) > +return cached_can_issue_more; Should that just be "return false"? > +{ > + int pos; > + rtx mem2; > + > + /* A fusable store was just scheduled. Scan the ready list for another > + store that it can fuse with. */ > + pos = lastpos; Declare the var not before here? > + while (pos >= 0) > + { And mem2 not before in this block? > +/* { dg-final { scan-assembler-times {stw 4,4\(3\)\n\tstw 6,8\(3\)} 1 { > target lp64 } } } */ > +/* { dg-final { scan-assembler-times {stw 4,4\(3\)\n\tstw 6,8\(3\)} 2 { > target ilp32 } } } */ > +/* { dg-final { scan-assembler-times {std 4,8\(3\)\n\tstd 6,16\(3\)} 1 { > target lp64 } } } */ > +/* { dg-final { scan-assembler-times {stfd 1,8\(3\)\n\tstfd 3,16\(3\)} 1 } } > */ This is probably okay because p10 is not supported on any config that uses register names. > +/* { dg-final { scan-assembler-not {stw 4,4\(3\)\n\tstw 6,8\(3\)} } } */ > +/* { dg-final { scan-assembler-not {std 4,8\(3\)\n\tstd 6,16\(3\)} { target > lp64 } } } */ > +/* { dg-final { scan-assembler-not {stfd 1,8\(3\)\n\tstfd 3,16\(3\)} } } */ Heh. A little fragile, the compiler could reorder the stores for other reasons, but the best we can do here I guess. Okay for trunk with the trivial cleanups. Thanks! Segher
Ping: [PATCH, rs6000 V2] Add store fusion support for Power10
Ping. On 8/11/21 11:02 AM, Pat Haugen via Gcc-patches wrote: > Enable store fusion on Power10. > > Use the SCHED_REORDER hook to implement Power10 specific ready list > reordering. > As of now this is just store fusion. > > Things changed in this version of the patch > - Separate patch for additional load/store checks > - Move option check from is_fusable_store() to caller > - Misc coding style changes pointed out in review (parens/braces) > - Add testcases > > Bootstrap/regtest on powerpc64(32/64) and powerpc64le(Power10) with no new > regressions. > Ok for master? > > -Pat > > > 2021-08-11 Pat Haugen > > gcc/ChangeLog: > > * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag. > (POWERPC_MASKS): Likewise. > * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable > store fusion for Power10. > (is_fusable_store): New. > (power10_sched_reorder): Likewise. > (rs6000_sched_reorder): Do Power10 specific reordering. > (rs6000_sched_reorder2): Likewise. > * config/rs6000/rs6000.opt: Add new option. > > gcc/testsuite/ChangeLog: > > * gcc.target/powerpc/fusion-p10-stst.c: New test. > * gcc.target/powerpc/fusion-p10-stst2.c: New test. > > > > diff --git a/gcc/config/rs6000/rs6000-cpus.def > b/gcc/config/rs6000/rs6000-cpus.def > index 6758296c0fd..f5812da0184 100644 > --- a/gcc/config/rs6000/rs6000-cpus.def > +++ b/gcc/config/rs6000/rs6000-cpus.def > @@ -90,7 +90,8 @@ >| OPTION_MASK_P10_FUSION_2LOGICAL \ >| OPTION_MASK_P10_FUSION_LOGADD\ >| OPTION_MASK_P10_FUSION_ADDLOG\ > - | OPTION_MASK_P10_FUSION_2ADD) > + | OPTION_MASK_P10_FUSION_2ADD \ > + | OPTION_MASK_P10_FUSION_2STORE) > > /* Flags that need to be turned off if -mno-power9-vector. */ > #define OTHER_P9_VECTOR_MASKS(OPTION_MASK_FLOAT128_HW > \ > @@ -143,6 +144,7 @@ >| OPTION_MASK_P10_FUSION_LOGADD\ >| OPTION_MASK_P10_FUSION_ADDLOG\ >| OPTION_MASK_P10_FUSION_2ADD \ > + | OPTION_MASK_P10_FUSION_2STORE\ >| OPTION_MASK_HTM \ >| OPTION_MASK_ISEL \ >| OPTION_MASK_MFCRF\ > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index 60f406a4ff6..402cc924e3f 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -4495,6 +4495,10 @@ rs6000_option_override_internal (bool global_init_p) >&& (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) > rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; > > + if (TARGET_POWER10 > + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) > +rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; > + >/* Turn off vector pair/mma options on non-power10 systems. */ >else if (!TARGET_POWER10 && TARGET_MMA) > { > @@ -18874,6 +18878,91 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos) >return cached_can_issue_more; > } > > +/* Determine if INSN is a store to memory that can be fused with a similar > + adjacent store. */ > + > +static bool > +is_fusable_store (rtx_insn *insn, rtx *str_mem) > +{ > + /* Insn must be a non-prefixed base+disp form store. */ > + if (is_store_insn (insn, str_mem) > + && get_attr_prefixed (insn) == PREFIXED_NO > + && get_attr_update (insn) == UPDATE_NO > + && get_attr_indexed (insn) == INDEXED_NO) > +{ > + /* Further restictions by mode and size. */ > + machine_mode mode = GET_MODE (*str_mem); > + HOST_WIDE_INT size; > + if (MEM_SIZE_KNOWN_P (*str_mem)) > + size = MEM_SIZE (*str_mem); > + else > + return false; > + > + if (INTEGRAL_MODE_P (mode)) > + /* Must be word or dword size. */ > + return (size == 4 || size == 8); > + else if (FLOAT_MODE_P (mode)) > + /* Must be dword size. */ > + return (size == 8); > +} > + > + return false; > +} > + > +/* Do Power10 specific reordering of the ready list. */ > + > +static int > +power10_sched_reorder (rtx_insn **ready, int lastpos) > +{ > + rtx mem1; > + > + /* Do store fusion during sched2 only. */ > + if (!reload_completed) > +return cached_can_issue_more; > + > + /* If the prior insn finished off a store fusion pair then simply > + reset the counter and return, nothing more to do. */ > + if (load_store_pendulum != 0) > +{ > + load_store_pendulum = 0; > + return cached_can_issue_more; > +} > + > + /* Try to pair certain store insns to adjacent memory locations > + so
[PATCH, rs6000 V2] Add store fusion support for Power10
Enable store fusion on Power10. Use the SCHED_REORDER hook to implement Power10 specific ready list reordering. As of now this is just store fusion. Things changed in this version of the patch - Separate patch for additional load/store checks - Move option check from is_fusable_store() to caller - Misc coding style changes pointed out in review (parens/braces) - Add testcases Bootstrap/regtest on powerpc64(32/64) and powerpc64le(Power10) with no new regressions. Ok for master? -Pat 2021-08-11 Pat Haugen gcc/ChangeLog: * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag. (POWERPC_MASKS): Likewise. * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable store fusion for Power10. (is_fusable_store): New. (power10_sched_reorder): Likewise. (rs6000_sched_reorder): Do Power10 specific reordering. (rs6000_sched_reorder2): Likewise. * config/rs6000/rs6000.opt: Add new option. gcc/testsuite/ChangeLog: * gcc.target/powerpc/fusion-p10-stst.c: New test. * gcc.target/powerpc/fusion-p10-stst2.c: New test. diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 6758296c0fd..f5812da0184 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -90,7 +90,8 @@ | OPTION_MASK_P10_FUSION_2LOGICAL \ | OPTION_MASK_P10_FUSION_LOGADD\ | OPTION_MASK_P10_FUSION_ADDLOG\ -| OPTION_MASK_P10_FUSION_2ADD) +| OPTION_MASK_P10_FUSION_2ADD \ +| OPTION_MASK_P10_FUSION_2STORE) /* Flags that need to be turned off if -mno-power9-vector. */ #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW\ @@ -143,6 +144,7 @@ | OPTION_MASK_P10_FUSION_LOGADD\ | OPTION_MASK_P10_FUSION_ADDLOG\ | OPTION_MASK_P10_FUSION_2ADD \ +| OPTION_MASK_P10_FUSION_2STORE\ | OPTION_MASK_HTM \ | OPTION_MASK_ISEL \ | OPTION_MASK_MFCRF\ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 60f406a4ff6..402cc924e3f 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4495,6 +4495,10 @@ rs6000_option_override_internal (bool global_init_p) && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; + if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) +rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; + /* Turn off vector pair/mma options on non-power10 systems. */ else if (!TARGET_POWER10 && TARGET_MMA) { @@ -18874,6 +18878,91 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos) return cached_can_issue_more; } +/* Determine if INSN is a store to memory that can be fused with a similar + adjacent store. */ + +static bool +is_fusable_store (rtx_insn *insn, rtx *str_mem) +{ + /* Insn must be a non-prefixed base+disp form store. */ + if (is_store_insn (insn, str_mem) + && get_attr_prefixed (insn) == PREFIXED_NO + && get_attr_update (insn) == UPDATE_NO + && get_attr_indexed (insn) == INDEXED_NO) +{ + /* Further restictions by mode and size. */ + machine_mode mode = GET_MODE (*str_mem); + HOST_WIDE_INT size; + if (MEM_SIZE_KNOWN_P (*str_mem)) + size = MEM_SIZE (*str_mem); + else + return false; + + if (INTEGRAL_MODE_P (mode)) + /* Must be word or dword size. */ + return (size == 4 || size == 8); + else if (FLOAT_MODE_P (mode)) + /* Must be dword size. */ + return (size == 8); +} + + return false; +} + +/* Do Power10 specific reordering of the ready list. */ + +static int +power10_sched_reorder (rtx_insn **ready, int lastpos) +{ + rtx mem1; + + /* Do store fusion during sched2 only. */ + if (!reload_completed) +return cached_can_issue_more; + + /* If the prior insn finished off a store fusion pair then simply + reset the counter and return, nothing more to do. */ + if (load_store_pendulum != 0) +{ + load_store_pendulum = 0; + return cached_can_issue_more; +} + + /* Try to pair certain store insns to adjacent memory locations + so that the hardware will fuse them to a single operation. */ + if (TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE + && is_fusable_store (last_scheduled_insn, )) +{ + int pos; + rtx mem2; + + /* A fusable store was just scheduled. Scan the ready list for