RE: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass

2023-05-24 Thread Cui, Lili via Gcc-patches
> > +rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
> > +const vec
> > +)
> >  {
> >enum tree_code opcode = gimple_assign_rhs_code (stmt);
> >int op_num = ops.length ();
> > @@ -5483,10 +5494,11 @@ rewrite_expr_tree_parallel (gassign *stmt, int
> width,
> >int stmt_num = op_num - 1;
> >gimple **stmts = XALLOCAVEC (gimple *, stmt_num);
> >int op_index = op_num - 1;
> > -  int stmt_index = 0;
> > -  int ready_stmts_end = 0;
> > -  int i = 0;
> > -  gimple *stmt1 = NULL, *stmt2 = NULL;
> > +  int width_count = width;
> > +  int i = 0, j = 0;
> > +  tree tmp_op[2], op1;
> > +  operand_entry *oe;
> > +  gimple *stmt1 = NULL;
> >tree last_rhs1 = gimple_assign_rhs1 (stmt);
> >
> >/* We start expression rewriting from the top statements.
> > @@ -5496,91 +5508,84 @@ rewrite_expr_tree_parallel (gassign *stmt, int
> width,
> >for (i = stmt_num - 2; i >= 0; i--)
> >  stmts[i] = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmts[i+1]));
> >
> > -  for (i = 0; i < stmt_num; i++)
> > +  /* Build parallel dependency chain according to width.  */  for (i
> > + = 0; i < width; i++)
> >  {
> > -  tree op1, op2;
> > -
> > -  /* Determine whether we should use results of
> > -already handled statements or not.  */
> > -  if (ready_stmts_end == 0
> > - && (i - stmt_index >= width || op_index < 1))
> > -   ready_stmts_end = i;
> > -
> > -  /* Now we choose operands for the next statement.  Non zero
> > -value in ready_stmts_end means here that we should use
> > -the result of already generated statements as new operand.  */
> > -  if (ready_stmts_end > 0)
> > -   {
> > - op1 = gimple_assign_lhs (stmts[stmt_index++]);
> > - if (ready_stmts_end > stmt_index)
> > -   op2 = gimple_assign_lhs (stmts[stmt_index++]);
> > - else if (op_index >= 0)
> > -   {
> > - operand_entry *oe = ops[op_index--];
> > - stmt2 = oe->stmt_to_insert;
> > - op2 = oe->op;
> > -   }
> > - else
> > -   {
> > - gcc_assert (stmt_index < i);
> > - op2 = gimple_assign_lhs (stmts[stmt_index++]);
> > -   }
> > +  /*   */
> 
> empty comment?

Added it, thanks.

> 
> > +  if (op_index > 1 && !has_fma)
> > +   swap_ops_for_binary_stmt (ops, op_index - 2);
> >
> > - if (stmt_index >= ready_stmts_end)
> > -   ready_stmts_end = 0;
> > -   }
> > -  else
> > +  for (j = 0; j < 2; j++)
> > {
> > - if (op_index > 1)
> > -   swap_ops_for_binary_stmt (ops, op_index - 2);
> > - operand_entry *oe2 = ops[op_index--];
> > - operand_entry *oe1 = ops[op_index--];
> > - op2 = oe2->op;
> > - stmt2 = oe2->stmt_to_insert;
> > - op1 = oe1->op;
> > - stmt1 = oe1->stmt_to_insert;
> > + gcc_assert (op_index >= 0);
> > + oe = ops[op_index--];
> > + tmp_op[j] = oe->op;
> > + /* If the stmt that defines operand has to be inserted, insert it
> > +before the use.  */
> > + stmt1 = oe->stmt_to_insert;
> > + if (stmt1)
> > +   insert_stmt_before_use (stmts[i], stmt1);
> > + stmt1 = NULL;
> > }
> > -
> > -  /* If we emit the last statement then we should put
> > -operands into the last statement.  It will also
> > -break the loop.  */
> > -  if (op_index < 0 && stmt_index == i)
> > -   i = stmt_num - 1;
> > +  stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1), tmp_op[1],
> tmp_op[0], opcode);
> > +  gimple_set_visited (stmts[i], true);
> >
> >if (dump_file && (dump_flags & TDF_DETAILS))
> > {
> > - fprintf (dump_file, "Transforming ");
> > + fprintf (dump_file, " into ");
> >   print_gimple_stmt (dump_file, stmts[i], 0);
> > }
> > +}
> >
> > -  /* If the stmt that defines operand has to be inserted, insert it
> > -before the use.  */
> > -  if (stmt1)
> > -   insert_stmt_before_use (stmts[i], stmt1);
> > -  if (stmt2)
> > -   insert_stmt_before_use (stmts[i], stmt2);
> > -  stmt1 = stmt2 = NULL;
> > -
> > -  /* We keep original statement only for the last one.  All
> > -others are recreated.  */
> > -  if (i == stmt_num - 1)
> > +  for (i = width; i < stmt_num; i++)
> > +{
> > +  /* We keep original statement only for the last one.  All others are
> > +recreated.  */
> > +  if ( op_index < 0)
> > {
> > - gimple_assign_set_rhs1 (stmts[i], op1);
> > - gimple_assign_set_rhs2 (stmts[i], op2);
> > - update_stmt (stmts[i]);
> > + if (width_count == 2)
> > +   {
> > +
> > + /* We keep original statement only for the last one.  All
> > +others are recreated.  */
> > + 

Re: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass

2023-05-22 Thread Richard Biener via Gcc-patches
On Wed, May 17, 2023 at 3:02 PM Cui, Lili  wrote:
>
> From: Lili Cui 
>
> Make some changes in reassoc pass to make it more friendly to fma pass later.
> Using FMA instead of mult + add reduces register pressure and insruction
> retired.
>
> There are mainly two changes
> 1. Put no-mult ops and mult ops alternately at the end of the queue, which is
> conducive to generating more fma and reducing the loss of FMA when breaking
> the chain.
> 2. Rewrite the rewrite_expr_tree_parallel function to try to build parallel
> chains according to the given correlation width, keeping the FMA chance as
> much as possible.
>
> TEST1:
>
> float
> foo (float a, float b, float c, float d, float *e)
> {
>return  *e  + a * b + c * d ;
> }
>
> For "-Ofast -mfpmath=sse -mfma" GCC generates:
> vmulss  %xmm3, %xmm2, %xmm2
> vfmadd132ss %xmm1, %xmm2, %xmm0
> vaddss  (%rdi), %xmm0, %xmm0
> ret
>
> With this patch GCC generates:
> vfmadd213ss   (%rdi), %xmm1, %xmm0
> vfmadd231ss   %xmm2, %xmm3, %xmm0
> ret
>
> TEST2:
>
> for (int i = 0; i < N; i++)
> {
>   a[i] += b[i]* c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * l[i] 
> + m[i]* o[i] + p[i];
> }
>
> For "-Ofast -mfpmath=sse -mfma"  GCC generates:
> vmovapd e(%rax), %ymm4
> vmulpd  d(%rax), %ymm4, %ymm3
> addq$32, %rax
> vmovapd c-32(%rax), %ymm5
> vmovapd j-32(%rax), %ymm6
> vmulpd  h-32(%rax), %ymm6, %ymm2
> vmovapd a-32(%rax), %ymm6
> vaddpd  p-32(%rax), %ymm6, %ymm0
> vmovapd g-32(%rax), %ymm7
> vfmadd231pd b-32(%rax), %ymm5, %ymm3
> vmovapd o-32(%rax), %ymm4
> vmulpd  m-32(%rax), %ymm4, %ymm1
> vmovapd l-32(%rax), %ymm5
> vfmadd231pd f-32(%rax), %ymm7, %ymm2
> vfmadd231pd k-32(%rax), %ymm5, %ymm1
> vaddpd  %ymm3, %ymm0, %ymm0
> vaddpd  %ymm2, %ymm0, %ymm0
> vaddpd  %ymm1, %ymm0, %ymm0
> vmovapd %ymm0, a-32(%rax)
> cmpq$8192, %rax
> jne .L4
> vzeroupper
> ret
>
> with this patch applied GCC breaks the chain with width = 2 and generates 6 
> fma:
>
> vmovapd a(%rax), %ymm2
> vmovapd c(%rax), %ymm0
> addq$32, %rax
> vmovapd e-32(%rax), %ymm1
> vmovapd p-32(%rax), %ymm5
> vmovapd g-32(%rax), %ymm3
> vmovapd j-32(%rax), %ymm6
> vmovapd l-32(%rax), %ymm4
> vmovapd o-32(%rax), %ymm7
> vfmadd132pd b-32(%rax), %ymm2, %ymm0
> vfmadd132pd d-32(%rax), %ymm5, %ymm1
> vfmadd231pd f-32(%rax), %ymm3, %ymm0
> vfmadd231pd h-32(%rax), %ymm6, %ymm1
> vfmadd231pd k-32(%rax), %ymm4, %ymm0
> vfmadd231pd m-32(%rax), %ymm7, %ymm1
> vaddpd  %ymm1, %ymm0, %ymm0
> vmovapd %ymm0, a-32(%rax)
> cmpq$8192, %rax
> jne .L2
> vzeroupper
> ret
>
> gcc/ChangeLog:
>
> PR gcc/98350
> * tree-ssa-reassoc.cc
> (rewrite_expr_tree_parallel): Rewrite this function.
> (rank_ops_for_fma): New.
> (reassociate_bb): Handle new function.
>
> gcc/testsuite/ChangeLog:
>
> PR gcc/98350
> * gcc.dg/pr98350-1.c: New test.
> * gcc.dg/pr98350-2.c: Ditto.
> ---
>  gcc/testsuite/gcc.dg/pr98350-1.c |  31 
>  gcc/testsuite/gcc.dg/pr98350-2.c |  11 ++
>  gcc/tree-ssa-reassoc.cc  | 256 +--
>  3 files changed, 215 insertions(+), 83 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr98350-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/pr98350-2.c
>
> diff --git a/gcc/testsuite/gcc.dg/pr98350-1.c 
> b/gcc/testsuite/gcc.dg/pr98350-1.c
> new file mode 100644
> index 000..185511c5e0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr98350-1.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mfpmath=sse -mfma -Wno-attributes " } */
> +
> +/* Test that the compiler properly optimizes multiply and add
> +   to generate more FMA instructions.  */
> +#define N 1024
> +double a[N];
> +double b[N];
> +double c[N];
> +double d[N];
> +double e[N];
> +double f[N];
> +double g[N];
> +double h[N];
> +double j[N];
> +double k[N];
> +double l[N];
> +double m[N];
> +double o[N];
> +double p[N];
> +
> +
> +void
> +foo (void)
> +{
> +  for (int i = 0; i < N; i++)
> +  {
> +a[i] += b[i] * c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * 
> l[i] + m[i]* o[i] + p[i];
> +  }
> +}
> +/* { dg-final { scan-assembler-times "vfm" 6  } } */
> diff --git a/gcc/testsuite/gcc.dg/pr98350-2.c 
> b/gcc/testsuite/gcc.dg/pr98350-2.c
> new file mode 100644
> index 000..b35d88aead9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr98350-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mfpmath=sse -mfma -Wno-attributes " } */
> +
> +/* Test that the compiler rearrange the ops to generate more FMA.  */
> +
> +float
> +foo1 

RE: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass

2023-05-18 Thread Cui, Lili via Gcc-patches
Attach CPU2017 3 run results:

On ICX: 
507.cactuBSSN_r: Improved by 1.7% for multi-copy .
503.bwaves_r  : Improved by  0.60% for single copy .
507.cactuBSSN_r : Improved by  1.10% for single copy .
519.lbm_r : Improved by  2.21% for single copy .
no measurable changes for other benchmarks.

On aarch64 
507.cactuBSSN_r: Improved by 1.7% for multi-copy.
503.bwaves_r : Improved by 6.00% for single-copy.
no measurable changes for other benchmarks.

> -Original Message-
> From: Cui, Lili 
> Sent: Wednesday, May 17, 2023 9:02 PM
> To: gcc-patches@gcc.gnu.org
> Cc: richard.guent...@gmail.com; Cui, Lili 
> Subject: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass
> 
> From: Lili Cui 
> 
> Make some changes in reassoc pass to make it more friendly to fma pass
> later.
> Using FMA instead of mult + add reduces register pressure and insruction
> retired.
> 
> There are mainly two changes
> 1. Put no-mult ops and mult ops alternately at the end of the queue, which is
> conducive to generating more fma and reducing the loss of FMA when
> breaking the chain.
> 2. Rewrite the rewrite_expr_tree_parallel function to try to build parallel
> chains according to the given correlation width, keeping the FMA chance as
> much as possible.
> 
> TEST1:
> 
> float
> foo (float a, float b, float c, float d, float *e) {
>return  *e  + a * b + c * d ;
> }
> 
> For "-Ofast -mfpmath=sse -mfma" GCC generates:
> vmulss  %xmm3, %xmm2, %xmm2
> vfmadd132ss %xmm1, %xmm2, %xmm0
> vaddss  (%rdi), %xmm0, %xmm0
> ret
> 
> With this patch GCC generates:
> vfmadd213ss   (%rdi), %xmm1, %xmm0
> vfmadd231ss   %xmm2, %xmm3, %xmm0
> ret
> 
> TEST2:
> 
> for (int i = 0; i < N; i++)
> {
>   a[i] += b[i]* c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * l[i] 
> + m[i]* o[i] +
> p[i]; }
> 
> For "-Ofast -mfpmath=sse -mfma"  GCC generates:
>   vmovapd e(%rax), %ymm4
>   vmulpd  d(%rax), %ymm4, %ymm3
>   addq$32, %rax
>   vmovapd c-32(%rax), %ymm5
>   vmovapd j-32(%rax), %ymm6
>   vmulpd  h-32(%rax), %ymm6, %ymm2
>   vmovapd a-32(%rax), %ymm6
>   vaddpd  p-32(%rax), %ymm6, %ymm0
>   vmovapd g-32(%rax), %ymm7
>   vfmadd231pd b-32(%rax), %ymm5, %ymm3
>   vmovapd o-32(%rax), %ymm4
>   vmulpd  m-32(%rax), %ymm4, %ymm1
>   vmovapd l-32(%rax), %ymm5
>   vfmadd231pd f-32(%rax), %ymm7, %ymm2
>   vfmadd231pd k-32(%rax), %ymm5, %ymm1
>   vaddpd  %ymm3, %ymm0, %ymm0
>   vaddpd  %ymm2, %ymm0, %ymm0
>   vaddpd  %ymm1, %ymm0, %ymm0
>   vmovapd %ymm0, a-32(%rax)
>   cmpq$8192, %rax
>   jne .L4
>   vzeroupper
>   ret
> 
> with this patch applied GCC breaks the chain with width = 2 and generates 6
> fma:
> 
>   vmovapd a(%rax), %ymm2
>   vmovapd c(%rax), %ymm0
>   addq$32, %rax
>   vmovapd e-32(%rax), %ymm1
>   vmovapd p-32(%rax), %ymm5
>   vmovapd g-32(%rax), %ymm3
>   vmovapd j-32(%rax), %ymm6
>   vmovapd l-32(%rax), %ymm4
>   vmovapd o-32(%rax), %ymm7
>   vfmadd132pd b-32(%rax), %ymm2, %ymm0
>   vfmadd132pd d-32(%rax), %ymm5, %ymm1
>   vfmadd231pd f-32(%rax), %ymm3, %ymm0
>   vfmadd231pd h-32(%rax), %ymm6, %ymm1
>   vfmadd231pd k-32(%rax), %ymm4, %ymm0
>   vfmadd231pd m-32(%rax), %ymm7, %ymm1
>   vaddpd  %ymm1, %ymm0, %ymm0
>   vmovapd %ymm0, a-32(%rax)
>   cmpq$8192, %rax
>   jne .L2
>   vzeroupper
>   ret
> 
> gcc/ChangeLog:
> 
>   PR gcc/98350
>   * tree-ssa-reassoc.cc
>   (rewrite_expr_tree_parallel): Rewrite this function.
>   (rank_ops_for_fma): New.
>   (reassociate_bb): Handle new function.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR gcc/98350
>   * gcc.dg/pr98350-1.c: New test.
>   * gcc.dg/pr98350-2.c: Ditto.
> ---
>  gcc/testsuite/gcc.dg/pr98350-1.c |  31   gcc/testsuite/gcc.dg/pr98350-2.c
> |  11 ++
>  gcc/tree-ssa-reassoc.cc  | 256 +--
>  3 files changed, 215 insertions(+), 83 deletions(-)  create mode 100644
> gcc/testsuite/gcc.dg/pr98350-1.c  create mode 100644
> gcc/testsuite/gcc.dg/pr98350-2.c
> 
> diff --git a/gcc/testsuite/gcc.dg/pr98350-1.c b/gcc/testsuite/gcc.dg/pr98350-
> 1.c
> new file mode 100644
> index 000..185511c5e0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr98350-1.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mfpmath=sse -mfma -Wno-attributes " } */
> +
> +/* Test that the compiler properly optimizes multiply and add
> +   to generate more FMA instructions.  */ #define N 1024 double a[N];
> +double b[N]; double c[N]; double d[N]; double e[N]; double f[N]; double
> +g[N]; double h[N]; double j[N]; double k[N]; double l[N]; double m[N];
> +double o[N]; double p[N];
> +
> +
> +void
> +foo (void)
> +{
> +  for (int i = 0; i < N; i++)
> +  {
> +a[i] += b[i] * c[i] + d[i] * e[i] + f[i] * g[i] +