[PATCH] aarch64: Account for different Advanced SIMD fusing options

Richard Sandiford via Gcc-patches Thu, 24 Aug 2023 02:20:04 -0700

The scalar FNMADD/FNMSUB and SVE FNMLA/FNMLS instructions mean
that either side of a subtraction can start an accumulator chain.
However, Advanced SIMD doesn't have an equivalent instruction.
This means that, for Advanced SIMD, a subtraction can only be
fused if the second operand is a multiplication.


Also, if both sides of a subtraction are multiplications,
and if the second operand is used multiple times, such as:

     c * d - a * b
     e * f - a * b

then the first rather than the second multiplication operand will tend
to be fused.  On Advanced SIMD, this leads to:

     tmp1 = a * b
     tmp2 = -tmp1
      ... = tmp2 + c * d   // FMLA
      ... = tmp2 + e * f   // FMLA

where one of the FMLAs also requires a MOV, since FMLA overwrites its
accumulator and tmp2 is needed by both.

This patch tries to account for this in the vector cost model.
It improves roms performance by 2-3% on Neoverse V1.  It's also
needed to avoid a regression in fotonik for Neoverse N2 and
Neoverse V2 with the patch for PR110625.

Tested on aarch64-linux-gnu & pushed.

Richard


gcc/
        * config/aarch64/aarch64.cc: Include ssa.h.
        (aarch64_multiply_add_p): Require the second operand of an
        Advanced SIMD subtraction to be a multiplication.  Assume that
        such an operation won't be fused if the second operand is used
        multiple times and if the first operand is also a multiplication.

gcc/testsuite/
        * gcc.target/aarch64/neoverse_v1_2.c: New test.
        * gcc.target/aarch64/neoverse_v1_3.c: Likewise.
---
 gcc/config/aarch64/aarch64.cc                 | 24 ++++++++++++++-----
 .../gcc.target/aarch64/neoverse_v1_2.c        | 15 ++++++++++++
 .../gcc.target/aarch64/neoverse_v1_3.c        | 14 +++++++++++
 3 files changed, 47 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/neoverse_v1_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/neoverse_v1_3.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 034628148ef..37d414021ca 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -84,6 +84,7 @@
 #include "aarch64-feature-deps.h"
 #include "config/arm/aarch-common.h"
 #include "config/arm/aarch-common-protos.h"
+#include "ssa.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -16411,20 +16412,20 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  for (int i = 1; i < 3; ++i)
+  auto is_mul_result = [&](int i)
     {
       tree rhs = gimple_op (assign, i);
       /* ??? Should we try to check for a single use as well?  */
       if (TREE_CODE (rhs) != SSA_NAME)
-       continue;
+       return false;
 
       stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
       if (!def_stmt_info
          || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
-       continue;
+       return false;
       gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
-       continue;
+       return false;
 
       if (vec_flags & VEC_ADVSIMD)
        {
@@ -16444,8 +16445,19 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
        }
 
       return true;
-    }
-  return false;
+    };
+
+  if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
+    /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
+       multiplication must be on the second operand (to form an FMLS).
+       But if both operands are multiplications and the second operand
+       is used more than once, we'll instead negate the second operand
+       and use it as an accumulator for the first operand.  */
+    return (is_mul_result (2)
+           && (has_single_use (gimple_assign_rhs2 (assign))
+               || !is_mul_result (1)));
+
+  return is_mul_result (1) || is_mul_result (2);
 }
 
 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
diff --git a/gcc/testsuite/gcc.target/aarch64/neoverse_v1_2.c b/gcc/testsuite/gcc.target/aarch64/neoverse_v1_2.c
new file mode 100644
index 00000000000..45d7e81c78e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/neoverse_v1_2.c
@@ -0,0 +1,15 @@
+/* { dg-options "-O2 -mcpu=neoverse-v1 --param aarch64-autovec-preference=1 -fdump-tree-vect-details" } */
+
+void
+f (float x[restrict][100], float y[restrict][100])
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      x[0][i] = y[0][i] * y[1][i] - y[3][i] * y[4][i];
+      x[1][i] = y[1][i] * y[2][i] - y[3][i] * y[4][i];
+    }
+}
+
+/* { dg-final { scan-tree-dump {_[0-9]+ - _[0-9]+ 1 times vector_stmt costs 2 } "vect" } } */
+/* { dg-final { scan-tree-dump-not {vector_stmt costs 0 } "vect" } } */
+/* { dg-final { scan-tree-dump {_[0-9]+ - _[0-9]+ 1 times scalar_stmt costs 0 } "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/neoverse_v1_3.c b/gcc/testsuite/gcc.target/aarch64/neoverse_v1_3.c
new file mode 100644
index 00000000000..de31fc13b28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/neoverse_v1_3.c
@@ -0,0 +1,14 @@
+/* { dg-options "-O2 -mcpu=neoverse-v1 --param aarch64-autovec-preference=2 -fdump-tree-vect-details" } */
+
+void
+f (float x[restrict][100], float y[restrict][100])
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      x[0][i] = y[0][i] * y[1][i] - y[3][i] * y[4][i];
+      x[1][i] = y[1][i] * y[2][i] - y[3][i] * y[4][i];
+    }
+}
+
+/* { dg-final { scan-tree-dump {_[0-9]+ - _[0-9]+ 1 times vector_stmt costs 0 } "vect" } } */
+/* { dg-final { scan-tree-dump {_[0-9]+ - _[0-9]+ 1 times scalar_stmt costs 0 } "vect" } } */
-- 
2.25.1

[PATCH] aarch64: Account for different Advanced SIMD fusing options

Reply via email to