Re: [PATCH, rs6000 V2] Add store fusion support for Power10

2021-08-26 Thread Segher Boessenkool
Hi!

On Wed, Aug 11, 2021 at 11:02:25AM -0500, Pat Haugen wrote:
>   * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag.
>   (POWERPC_MASKS): Likewise.

"Add OPTION_MASK_P10_FUSION_2STORE", instead.

> +static bool
> +is_fusable_store (rtx_insn *insn, rtx *str_mem)
> +{
> +  /* Insn must be a non-prefixed base+disp form store.  */
> +  if (is_store_insn (insn, str_mem)
> +  && get_attr_prefixed (insn) == PREFIXED_NO
> +  && get_attr_update (insn) == UPDATE_NO
> +  && get_attr_indexed (insn) == INDEXED_NO)
> +{
> +  /* Further restictions by mode and size.  */

(typo, "restrictions")

> +  machine_mode mode = GET_MODE (*str_mem);
> +  HOST_WIDE_INT size;
> +  if (MEM_SIZE_KNOWN_P (*str_mem))
> + size = MEM_SIZE (*str_mem);
> +  else
> + return false;

  if (!MEM_SIZE_KNOWN_P (*str_mem))
return false;

  machine_mode mode = GET_MODE (*str_mem);
  HOST_WIDE_INT size = MEM_SIZE (*str_mem);

> +static int
> +power10_sched_reorder (rtx_insn **ready, int lastpos)
> +{
> +  rtx mem1;
> +
> +  /* Do store fusion during sched2 only.  */
> +  if (!reload_completed)
> +return cached_can_issue_more;

Should that just be "return false"?

> +{
> +  int pos;
> +  rtx mem2;
> +
> +  /* A fusable store was just scheduled.  Scan the ready list for another
> +  store that it can fuse with.  */
> +  pos = lastpos;

Declare the var not before here?

> +  while (pos >= 0)
> + {

And mem2 not before in this block?

> +/* { dg-final { scan-assembler-times {stw 4,4\(3\)\n\tstw 6,8\(3\)} 1 { 
> target lp64 } } } */
> +/* { dg-final { scan-assembler-times {stw 4,4\(3\)\n\tstw 6,8\(3\)} 2 { 
> target ilp32 } } } */
> +/* { dg-final { scan-assembler-times {std 4,8\(3\)\n\tstd 6,16\(3\)} 1 { 
> target lp64 } } } */
> +/* { dg-final { scan-assembler-times {stfd 1,8\(3\)\n\tstfd 3,16\(3\)} 1 } } 
> */

This is probably okay because p10 is not supported on any config that
uses register names.

> +/* { dg-final { scan-assembler-not {stw 4,4\(3\)\n\tstw 6,8\(3\)} } } */
> +/* { dg-final { scan-assembler-not {std 4,8\(3\)\n\tstd 6,16\(3\)} { target 
> lp64 } } } */
> +/* { dg-final { scan-assembler-not {stfd 1,8\(3\)\n\tstfd 3,16\(3\)} } } */

Heh.  A little fragile, the compiler could reorder the stores for other
reasons, but the best we can do here I guess.

Okay for trunk with the trivial cleanups.  Thanks!


Segher


Ping: [PATCH, rs6000 V2] Add store fusion support for Power10

2021-08-26 Thread Pat Haugen via Gcc-patches
Ping.

On 8/11/21 11:02 AM, Pat Haugen via Gcc-patches wrote:
> Enable store fusion on Power10.
> 
> Use the SCHED_REORDER hook to implement Power10 specific ready list 
> reordering.
> As of now this is just store fusion.
> 
> Things changed in this version of the patch
> - Separate patch for additional load/store checks
> - Move option check from is_fusable_store() to caller
> - Misc coding style changes pointed out in review (parens/braces)
> - Add testcases
> 
> Bootstrap/regtest on powerpc64(32/64) and powerpc64le(Power10) with no new 
> regressions.
> Ok for master?
> 
> -Pat
> 
> 
> 2021-08-11  Pat Haugen  
> 
> gcc/ChangeLog:
> 
>   * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag.
>   (POWERPC_MASKS): Likewise.
>   * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable
>   store fusion for Power10.
>   (is_fusable_store): New.
>   (power10_sched_reorder): Likewise.
>   (rs6000_sched_reorder): Do Power10 specific reordering.
>   (rs6000_sched_reorder2): Likewise.
>   * config/rs6000/rs6000.opt: Add new option.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/powerpc/fusion-p10-stst.c: New test.
>   * gcc.target/powerpc/fusion-p10-stst2.c: New test.
> 
> 
> 
> diff --git a/gcc/config/rs6000/rs6000-cpus.def 
> b/gcc/config/rs6000/rs6000-cpus.def
> index 6758296c0fd..f5812da0184 100644
> --- a/gcc/config/rs6000/rs6000-cpus.def
> +++ b/gcc/config/rs6000/rs6000-cpus.def
> @@ -90,7 +90,8 @@
>| OPTION_MASK_P10_FUSION_2LOGICAL  \
>| OPTION_MASK_P10_FUSION_LOGADD\
>| OPTION_MASK_P10_FUSION_ADDLOG\
> -  | OPTION_MASK_P10_FUSION_2ADD)
> +  | OPTION_MASK_P10_FUSION_2ADD  \
> +  | OPTION_MASK_P10_FUSION_2STORE)
>  
>  /* Flags that need to be turned off if -mno-power9-vector.  */
>  #define OTHER_P9_VECTOR_MASKS(OPTION_MASK_FLOAT128_HW
> \
> @@ -143,6 +144,7 @@
>| OPTION_MASK_P10_FUSION_LOGADD\
>| OPTION_MASK_P10_FUSION_ADDLOG\
>| OPTION_MASK_P10_FUSION_2ADD  \
> +  | OPTION_MASK_P10_FUSION_2STORE\
>| OPTION_MASK_HTM  \
>| OPTION_MASK_ISEL \
>| OPTION_MASK_MFCRF\
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 60f406a4ff6..402cc924e3f 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -4495,6 +4495,10 @@ rs6000_option_override_internal (bool global_init_p)
>&& (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
>  rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
>  
> +  if (TARGET_POWER10
> +  && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
> +rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
> +
>/* Turn off vector pair/mma options on non-power10 systems.  */
>else if (!TARGET_POWER10 && TARGET_MMA)
>  {
> @@ -18874,6 +18878,91 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
>return cached_can_issue_more;
>  }
>  
> +/* Determine if INSN is a store to memory that can be fused with a similar
> +   adjacent store.  */
> +
> +static bool
> +is_fusable_store (rtx_insn *insn, rtx *str_mem)
> +{
> +  /* Insn must be a non-prefixed base+disp form store.  */
> +  if (is_store_insn (insn, str_mem)
> +  && get_attr_prefixed (insn) == PREFIXED_NO
> +  && get_attr_update (insn) == UPDATE_NO
> +  && get_attr_indexed (insn) == INDEXED_NO)
> +{
> +  /* Further restictions by mode and size.  */
> +  machine_mode mode = GET_MODE (*str_mem);
> +  HOST_WIDE_INT size;
> +  if (MEM_SIZE_KNOWN_P (*str_mem))
> + size = MEM_SIZE (*str_mem);
> +  else
> + return false;
> +
> +  if (INTEGRAL_MODE_P (mode))
> + /* Must be word or dword size.  */
> + return (size == 4 || size == 8);
> +  else if (FLOAT_MODE_P (mode))
> + /* Must be dword size.  */
> + return (size == 8);
> +}
> +
> +  return false;
> +}
> +
> +/* Do Power10 specific reordering of the ready list.  */
> +
> +static int
> +power10_sched_reorder (rtx_insn **ready, int lastpos)
> +{
> +  rtx mem1;
> +
> +  /* Do store fusion during sched2 only.  */
> +  if (!reload_completed)
> +return cached_can_issue_more;
> +
> +  /* If the prior insn finished off a store fusion pair then simply
> + reset the counter and return, nothing more to do.  */
> +  if (load_store_pendulum != 0)
> +{
> +  load_store_pendulum = 0;
> +  return cached_can_issue_more;
> +}
> +
> +  /* Try to pair certain store insns to adjacent memory locations
> + so 

[PATCH, rs6000 V2] Add store fusion support for Power10

2021-08-11 Thread Pat Haugen via Gcc-patches
Enable store fusion on Power10.

Use the SCHED_REORDER hook to implement Power10 specific ready list reordering.
As of now this is just store fusion.

Things changed in this version of the patch
- Separate patch for additional load/store checks
- Move option check from is_fusable_store() to caller
- Misc coding style changes pointed out in review (parens/braces)
- Add testcases

Bootstrap/regtest on powerpc64(32/64) and powerpc64le(Power10) with no new 
regressions.
Ok for master?

-Pat


2021-08-11  Pat Haugen  

gcc/ChangeLog:

* config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag.
(POWERPC_MASKS): Likewise.
* config/rs6000/rs6000.c (rs6000_option_override_internal): Enable
store fusion for Power10.
(is_fusable_store): New.
(power10_sched_reorder): Likewise.
(rs6000_sched_reorder): Do Power10 specific reordering.
(rs6000_sched_reorder2): Likewise.
* config/rs6000/rs6000.opt: Add new option.

gcc/testsuite/ChangeLog:

* gcc.target/powerpc/fusion-p10-stst.c: New test.
* gcc.target/powerpc/fusion-p10-stst2.c: New test.



diff --git a/gcc/config/rs6000/rs6000-cpus.def 
b/gcc/config/rs6000/rs6000-cpus.def
index 6758296c0fd..f5812da0184 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -90,7 +90,8 @@
 | OPTION_MASK_P10_FUSION_2LOGICAL  \
 | OPTION_MASK_P10_FUSION_LOGADD\
 | OPTION_MASK_P10_FUSION_ADDLOG\
-| OPTION_MASK_P10_FUSION_2ADD)
+| OPTION_MASK_P10_FUSION_2ADD  \
+| OPTION_MASK_P10_FUSION_2STORE)
 
 /* Flags that need to be turned off if -mno-power9-vector.  */
 #define OTHER_P9_VECTOR_MASKS  (OPTION_MASK_FLOAT128_HW\
@@ -143,6 +144,7 @@
 | OPTION_MASK_P10_FUSION_LOGADD\
 | OPTION_MASK_P10_FUSION_ADDLOG\
 | OPTION_MASK_P10_FUSION_2ADD  \
+| OPTION_MASK_P10_FUSION_2STORE\
 | OPTION_MASK_HTM  \
 | OPTION_MASK_ISEL \
 | OPTION_MASK_MFCRF\
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 60f406a4ff6..402cc924e3f 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4495,6 +4495,10 @@ rs6000_option_override_internal (bool global_init_p)
   && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
 rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
 
+  if (TARGET_POWER10
+  && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
+rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
+
   /* Turn off vector pair/mma options on non-power10 systems.  */
   else if (!TARGET_POWER10 && TARGET_MMA)
 {
@@ -18874,6 +18878,91 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
   return cached_can_issue_more;
 }
 
+/* Determine if INSN is a store to memory that can be fused with a similar
+   adjacent store.  */
+
+static bool
+is_fusable_store (rtx_insn *insn, rtx *str_mem)
+{
+  /* Insn must be a non-prefixed base+disp form store.  */
+  if (is_store_insn (insn, str_mem)
+  && get_attr_prefixed (insn) == PREFIXED_NO
+  && get_attr_update (insn) == UPDATE_NO
+  && get_attr_indexed (insn) == INDEXED_NO)
+{
+  /* Further restictions by mode and size.  */
+  machine_mode mode = GET_MODE (*str_mem);
+  HOST_WIDE_INT size;
+  if (MEM_SIZE_KNOWN_P (*str_mem))
+   size = MEM_SIZE (*str_mem);
+  else
+   return false;
+
+  if (INTEGRAL_MODE_P (mode))
+   /* Must be word or dword size.  */
+   return (size == 4 || size == 8);
+  else if (FLOAT_MODE_P (mode))
+   /* Must be dword size.  */
+   return (size == 8);
+}
+
+  return false;
+}
+
+/* Do Power10 specific reordering of the ready list.  */
+
+static int
+power10_sched_reorder (rtx_insn **ready, int lastpos)
+{
+  rtx mem1;
+
+  /* Do store fusion during sched2 only.  */
+  if (!reload_completed)
+return cached_can_issue_more;
+
+  /* If the prior insn finished off a store fusion pair then simply
+ reset the counter and return, nothing more to do.  */
+  if (load_store_pendulum != 0)
+{
+  load_store_pendulum = 0;
+  return cached_can_issue_more;
+}
+
+  /* Try to pair certain store insns to adjacent memory locations
+ so that the hardware will fuse them to a single operation.  */
+  if (TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE
+  && is_fusable_store (last_scheduled_insn, ))
+{
+  int pos;
+  rtx mem2;
+
+  /* A fusable store was just scheduled.  Scan the ready list for