RE: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass
> > +rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma, > > +const vec > > +) > > { > >enum tree_code opcode = gimple_assign_rhs_code (stmt); > >int op_num = ops.length (); > > @@ -5483,10 +5494,11 @@ rewrite_expr_tree_parallel (gassign *stmt, int > width, > >int stmt_num = op_num - 1; > >gimple **stmts = XALLOCAVEC (gimple *, stmt_num); > >int op_index = op_num - 1; > > - int stmt_index = 0; > > - int ready_stmts_end = 0; > > - int i = 0; > > - gimple *stmt1 = NULL, *stmt2 = NULL; > > + int width_count = width; > > + int i = 0, j = 0; > > + tree tmp_op[2], op1; > > + operand_entry *oe; > > + gimple *stmt1 = NULL; > >tree last_rhs1 = gimple_assign_rhs1 (stmt); > > > >/* We start expression rewriting from the top statements. > > @@ -5496,91 +5508,84 @@ rewrite_expr_tree_parallel (gassign *stmt, int > width, > >for (i = stmt_num - 2; i >= 0; i--) > > stmts[i] = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmts[i+1])); > > > > - for (i = 0; i < stmt_num; i++) > > + /* Build parallel dependency chain according to width. */ for (i > > + = 0; i < width; i++) > > { > > - tree op1, op2; > > - > > - /* Determine whether we should use results of > > -already handled statements or not. */ > > - if (ready_stmts_end == 0 > > - && (i - stmt_index >= width || op_index < 1)) > > - ready_stmts_end = i; > > - > > - /* Now we choose operands for the next statement. Non zero > > -value in ready_stmts_end means here that we should use > > -the result of already generated statements as new operand. */ > > - if (ready_stmts_end > 0) > > - { > > - op1 = gimple_assign_lhs (stmts[stmt_index++]); > > - if (ready_stmts_end > stmt_index) > > - op2 = gimple_assign_lhs (stmts[stmt_index++]); > > - else if (op_index >= 0) > > - { > > - operand_entry *oe = ops[op_index--]; > > - stmt2 = oe->stmt_to_insert; > > - op2 = oe->op; > > - } > > - else > > - { > > - gcc_assert (stmt_index < i); > > - op2 = gimple_assign_lhs (stmts[stmt_index++]); > > - } > > + /* */ > > empty comment? Added it, thanks. > > > + if (op_index > 1 && !has_fma) > > + swap_ops_for_binary_stmt (ops, op_index - 2); > > > > - if (stmt_index >= ready_stmts_end) > > - ready_stmts_end = 0; > > - } > > - else > > + for (j = 0; j < 2; j++) > > { > > - if (op_index > 1) > > - swap_ops_for_binary_stmt (ops, op_index - 2); > > - operand_entry *oe2 = ops[op_index--]; > > - operand_entry *oe1 = ops[op_index--]; > > - op2 = oe2->op; > > - stmt2 = oe2->stmt_to_insert; > > - op1 = oe1->op; > > - stmt1 = oe1->stmt_to_insert; > > + gcc_assert (op_index >= 0); > > + oe = ops[op_index--]; > > + tmp_op[j] = oe->op; > > + /* If the stmt that defines operand has to be inserted, insert it > > +before the use. */ > > + stmt1 = oe->stmt_to_insert; > > + if (stmt1) > > + insert_stmt_before_use (stmts[i], stmt1); > > + stmt1 = NULL; > > } > > - > > - /* If we emit the last statement then we should put > > -operands into the last statement. It will also > > -break the loop. */ > > - if (op_index < 0 && stmt_index == i) > > - i = stmt_num - 1; > > + stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1), tmp_op[1], > tmp_op[0], opcode); > > + gimple_set_visited (stmts[i], true); > > > >if (dump_file && (dump_flags & TDF_DETAILS)) > > { > > - fprintf (dump_file, "Transforming "); > > + fprintf (dump_file, " into "); > > print_gimple_stmt (dump_file, stmts[i], 0); > > } > > +} > > > > - /* If the stmt that defines operand has to be inserted, insert it > > -before the use. */ > > - if (stmt1) > > - insert_stmt_before_use (stmts[i], stmt1); > > - if (stmt2) > > - insert_stmt_before_use (stmts[i], stmt2); > > - stmt1 = stmt2 = NULL; > > - > > - /* We keep original statement only for the last one. All > > -others are recreated. */ > > - if (i == stmt_num - 1) > > + for (i = width; i < stmt_num; i++) > > +{ > > + /* We keep original statement only for the last one. All others are > > +recreated. */ > > + if ( op_index < 0) > > { > > - gimple_assign_set_rhs1 (stmts[i], op1); > > - gimple_assign_set_rhs2 (stmts[i], op2); > > - update_stmt (stmts[i]); > > + if (width_count == 2) > > + { > > + > > + /* We keep original statement only for the last one. All > > +others are recreated. */ > > +
Re: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass
On Wed, May 17, 2023 at 3:02 PM Cui, Lili wrote: > > From: Lili Cui > > Make some changes in reassoc pass to make it more friendly to fma pass later. > Using FMA instead of mult + add reduces register pressure and insruction > retired. > > There are mainly two changes > 1. Put no-mult ops and mult ops alternately at the end of the queue, which is > conducive to generating more fma and reducing the loss of FMA when breaking > the chain. > 2. Rewrite the rewrite_expr_tree_parallel function to try to build parallel > chains according to the given correlation width, keeping the FMA chance as > much as possible. > > TEST1: > > float > foo (float a, float b, float c, float d, float *e) > { >return *e + a * b + c * d ; > } > > For "-Ofast -mfpmath=sse -mfma" GCC generates: > vmulss %xmm3, %xmm2, %xmm2 > vfmadd132ss %xmm1, %xmm2, %xmm0 > vaddss (%rdi), %xmm0, %xmm0 > ret > > With this patch GCC generates: > vfmadd213ss (%rdi), %xmm1, %xmm0 > vfmadd231ss %xmm2, %xmm3, %xmm0 > ret > > TEST2: > > for (int i = 0; i < N; i++) > { > a[i] += b[i]* c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * l[i] > + m[i]* o[i] + p[i]; > } > > For "-Ofast -mfpmath=sse -mfma" GCC generates: > vmovapd e(%rax), %ymm4 > vmulpd d(%rax), %ymm4, %ymm3 > addq$32, %rax > vmovapd c-32(%rax), %ymm5 > vmovapd j-32(%rax), %ymm6 > vmulpd h-32(%rax), %ymm6, %ymm2 > vmovapd a-32(%rax), %ymm6 > vaddpd p-32(%rax), %ymm6, %ymm0 > vmovapd g-32(%rax), %ymm7 > vfmadd231pd b-32(%rax), %ymm5, %ymm3 > vmovapd o-32(%rax), %ymm4 > vmulpd m-32(%rax), %ymm4, %ymm1 > vmovapd l-32(%rax), %ymm5 > vfmadd231pd f-32(%rax), %ymm7, %ymm2 > vfmadd231pd k-32(%rax), %ymm5, %ymm1 > vaddpd %ymm3, %ymm0, %ymm0 > vaddpd %ymm2, %ymm0, %ymm0 > vaddpd %ymm1, %ymm0, %ymm0 > vmovapd %ymm0, a-32(%rax) > cmpq$8192, %rax > jne .L4 > vzeroupper > ret > > with this patch applied GCC breaks the chain with width = 2 and generates 6 > fma: > > vmovapd a(%rax), %ymm2 > vmovapd c(%rax), %ymm0 > addq$32, %rax > vmovapd e-32(%rax), %ymm1 > vmovapd p-32(%rax), %ymm5 > vmovapd g-32(%rax), %ymm3 > vmovapd j-32(%rax), %ymm6 > vmovapd l-32(%rax), %ymm4 > vmovapd o-32(%rax), %ymm7 > vfmadd132pd b-32(%rax), %ymm2, %ymm0 > vfmadd132pd d-32(%rax), %ymm5, %ymm1 > vfmadd231pd f-32(%rax), %ymm3, %ymm0 > vfmadd231pd h-32(%rax), %ymm6, %ymm1 > vfmadd231pd k-32(%rax), %ymm4, %ymm0 > vfmadd231pd m-32(%rax), %ymm7, %ymm1 > vaddpd %ymm1, %ymm0, %ymm0 > vmovapd %ymm0, a-32(%rax) > cmpq$8192, %rax > jne .L2 > vzeroupper > ret > > gcc/ChangeLog: > > PR gcc/98350 > * tree-ssa-reassoc.cc > (rewrite_expr_tree_parallel): Rewrite this function. > (rank_ops_for_fma): New. > (reassociate_bb): Handle new function. > > gcc/testsuite/ChangeLog: > > PR gcc/98350 > * gcc.dg/pr98350-1.c: New test. > * gcc.dg/pr98350-2.c: Ditto. > --- > gcc/testsuite/gcc.dg/pr98350-1.c | 31 > gcc/testsuite/gcc.dg/pr98350-2.c | 11 ++ > gcc/tree-ssa-reassoc.cc | 256 +-- > 3 files changed, 215 insertions(+), 83 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/pr98350-1.c > create mode 100644 gcc/testsuite/gcc.dg/pr98350-2.c > > diff --git a/gcc/testsuite/gcc.dg/pr98350-1.c > b/gcc/testsuite/gcc.dg/pr98350-1.c > new file mode 100644 > index 000..185511c5e0a > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/pr98350-1.c > @@ -0,0 +1,31 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -mfpmath=sse -mfma -Wno-attributes " } */ > + > +/* Test that the compiler properly optimizes multiply and add > + to generate more FMA instructions. */ > +#define N 1024 > +double a[N]; > +double b[N]; > +double c[N]; > +double d[N]; > +double e[N]; > +double f[N]; > +double g[N]; > +double h[N]; > +double j[N]; > +double k[N]; > +double l[N]; > +double m[N]; > +double o[N]; > +double p[N]; > + > + > +void > +foo (void) > +{ > + for (int i = 0; i < N; i++) > + { > +a[i] += b[i] * c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * > l[i] + m[i]* o[i] + p[i]; > + } > +} > +/* { dg-final { scan-assembler-times "vfm" 6 } } */ > diff --git a/gcc/testsuite/gcc.dg/pr98350-2.c > b/gcc/testsuite/gcc.dg/pr98350-2.c > new file mode 100644 > index 000..b35d88aead9 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/pr98350-2.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -mfpmath=sse -mfma -Wno-attributes " } */ > + > +/* Test that the compiler rearrange the ops to generate more FMA. */ > + > +float > +foo1
RE: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass
Attach CPU2017 3 run results: On ICX: 507.cactuBSSN_r: Improved by 1.7% for multi-copy . 503.bwaves_r : Improved by 0.60% for single copy . 507.cactuBSSN_r : Improved by 1.10% for single copy . 519.lbm_r : Improved by 2.21% for single copy . no measurable changes for other benchmarks. On aarch64 507.cactuBSSN_r: Improved by 1.7% for multi-copy. 503.bwaves_r : Improved by 6.00% for single-copy. no measurable changes for other benchmarks. > -Original Message- > From: Cui, Lili > Sent: Wednesday, May 17, 2023 9:02 PM > To: gcc-patches@gcc.gnu.org > Cc: richard.guent...@gmail.com; Cui, Lili > Subject: [PATCH] PR gcc/98350:Handle FMA friendly in reassoc pass > > From: Lili Cui > > Make some changes in reassoc pass to make it more friendly to fma pass > later. > Using FMA instead of mult + add reduces register pressure and insruction > retired. > > There are mainly two changes > 1. Put no-mult ops and mult ops alternately at the end of the queue, which is > conducive to generating more fma and reducing the loss of FMA when > breaking the chain. > 2. Rewrite the rewrite_expr_tree_parallel function to try to build parallel > chains according to the given correlation width, keeping the FMA chance as > much as possible. > > TEST1: > > float > foo (float a, float b, float c, float d, float *e) { >return *e + a * b + c * d ; > } > > For "-Ofast -mfpmath=sse -mfma" GCC generates: > vmulss %xmm3, %xmm2, %xmm2 > vfmadd132ss %xmm1, %xmm2, %xmm0 > vaddss (%rdi), %xmm0, %xmm0 > ret > > With this patch GCC generates: > vfmadd213ss (%rdi), %xmm1, %xmm0 > vfmadd231ss %xmm2, %xmm3, %xmm0 > ret > > TEST2: > > for (int i = 0; i < N; i++) > { > a[i] += b[i]* c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * l[i] > + m[i]* o[i] + > p[i]; } > > For "-Ofast -mfpmath=sse -mfma" GCC generates: > vmovapd e(%rax), %ymm4 > vmulpd d(%rax), %ymm4, %ymm3 > addq$32, %rax > vmovapd c-32(%rax), %ymm5 > vmovapd j-32(%rax), %ymm6 > vmulpd h-32(%rax), %ymm6, %ymm2 > vmovapd a-32(%rax), %ymm6 > vaddpd p-32(%rax), %ymm6, %ymm0 > vmovapd g-32(%rax), %ymm7 > vfmadd231pd b-32(%rax), %ymm5, %ymm3 > vmovapd o-32(%rax), %ymm4 > vmulpd m-32(%rax), %ymm4, %ymm1 > vmovapd l-32(%rax), %ymm5 > vfmadd231pd f-32(%rax), %ymm7, %ymm2 > vfmadd231pd k-32(%rax), %ymm5, %ymm1 > vaddpd %ymm3, %ymm0, %ymm0 > vaddpd %ymm2, %ymm0, %ymm0 > vaddpd %ymm1, %ymm0, %ymm0 > vmovapd %ymm0, a-32(%rax) > cmpq$8192, %rax > jne .L4 > vzeroupper > ret > > with this patch applied GCC breaks the chain with width = 2 and generates 6 > fma: > > vmovapd a(%rax), %ymm2 > vmovapd c(%rax), %ymm0 > addq$32, %rax > vmovapd e-32(%rax), %ymm1 > vmovapd p-32(%rax), %ymm5 > vmovapd g-32(%rax), %ymm3 > vmovapd j-32(%rax), %ymm6 > vmovapd l-32(%rax), %ymm4 > vmovapd o-32(%rax), %ymm7 > vfmadd132pd b-32(%rax), %ymm2, %ymm0 > vfmadd132pd d-32(%rax), %ymm5, %ymm1 > vfmadd231pd f-32(%rax), %ymm3, %ymm0 > vfmadd231pd h-32(%rax), %ymm6, %ymm1 > vfmadd231pd k-32(%rax), %ymm4, %ymm0 > vfmadd231pd m-32(%rax), %ymm7, %ymm1 > vaddpd %ymm1, %ymm0, %ymm0 > vmovapd %ymm0, a-32(%rax) > cmpq$8192, %rax > jne .L2 > vzeroupper > ret > > gcc/ChangeLog: > > PR gcc/98350 > * tree-ssa-reassoc.cc > (rewrite_expr_tree_parallel): Rewrite this function. > (rank_ops_for_fma): New. > (reassociate_bb): Handle new function. > > gcc/testsuite/ChangeLog: > > PR gcc/98350 > * gcc.dg/pr98350-1.c: New test. > * gcc.dg/pr98350-2.c: Ditto. > --- > gcc/testsuite/gcc.dg/pr98350-1.c | 31 gcc/testsuite/gcc.dg/pr98350-2.c > | 11 ++ > gcc/tree-ssa-reassoc.cc | 256 +-- > 3 files changed, 215 insertions(+), 83 deletions(-) create mode 100644 > gcc/testsuite/gcc.dg/pr98350-1.c create mode 100644 > gcc/testsuite/gcc.dg/pr98350-2.c > > diff --git a/gcc/testsuite/gcc.dg/pr98350-1.c b/gcc/testsuite/gcc.dg/pr98350- > 1.c > new file mode 100644 > index 000..185511c5e0a > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/pr98350-1.c > @@ -0,0 +1,31 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -mfpmath=sse -mfma -Wno-attributes " } */ > + > +/* Test that the compiler properly optimizes multiply and add > + to generate more FMA instructions. */ #define N 1024 double a[N]; > +double b[N]; double c[N]; double d[N]; double e[N]; double f[N]; double > +g[N]; double h[N]; double j[N]; double k[N]; double l[N]; double m[N]; > +double o[N]; double p[N]; > + > + > +void > +foo (void) > +{ > + for (int i = 0; i < N; i++) > + { > +a[i] += b[i] * c[i] + d[i] * e[i] + f[i] * g[i] +