From 9800fcccd24be02cf4f733eaa1136fc0201d3ea5 Mon Sep 17 00:00:00 2001
From: Jennifer Schmitz <jschmitz@nvidia.com>
Date: Mon, 23 Sep 2024 06:52:46 -0700
Subject: [PATCH] [PR113816] AArch64: Use SVE bit op reduction for vector
 reductions

This patch implements the optabs reduc_and_scal_<mode>,
reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for Advanced SIMD
integers for TARGET_SVE in order to use the SVE instructions ANDV, ORV, and
EORV for fixed-width bitwise reductions.
For example, the test case

int32_t foo (int32_t *a)
{
  int32_t b = -1;
  for (int i = 0; i < 4; ++i)
    b &= a[i];
  return b;
}

was previously compiled to
(-O2 -ftree-vectorize --param aarch64-autovec-preference=asimd-only):
foo:
        ldp     w2, w1, [x0]
        ldp     w3, w0, [x0, 8]
        and     w1, w1, w3
        and     w0, w0, w2
        and     w0, w1, w0
        ret

With patch, it is compiled to:
foo:
        ldr     q31, [x0]
        ptrue   p7.b, all
        andv    s31, p7, z31.s
        fmov    w0, s31
        ret

Test cases were added to check the produced assembly for use of SVE
instructions.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	PR target/113816
	* config/aarch64/aarch64-sve.md (reduc_<optab>_scal_<mode>): New
	expander for logical reductions of VQ_I modes.
	(*aarch64_pred_reduc_<optab>_<mode>): New pattern for VQ_I modes.
	* config/aarch64/iterators.md (SVE_INT_REDUCTION_LOGICAL): New iterator
	for logical reduction expressions.

gcc/testsuite
	PR target/113816
	* gcc.target/aarch64/sve/logical_reduc.c: New test.
---
 gcc/config/aarch64/aarch64-sve.md             | 22 +++++++
 gcc/config/aarch64/iterators.md               |  4 ++
 .../gcc.target/aarch64/sve/logical_reduc.c    | 58 +++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/logical_reduc.c
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bfa28849adf..0d9e5cebef0 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8927,6 +8927,28 @@
   "<su>addv\t%d0, %1, %2.<Vetype>"
 )
 
+;; Unpredicated logical integer reductions for Advanced SIMD modes.  NOTE(review): all-true predicate -- confirm ANDV is safe when SVE VL > 128.
+(define_expand "reduc_<optab>_scal_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand")
+	(unspec:<VEL> [(match_dup 2)
+		       (match_operand:VQ_I 1 "register_operand")]
+		      SVE_INT_REDUCTION_LOGICAL))]
+  "TARGET_SVE"
+  {
+    operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
+;; Predicated logical integer reductions; the Advanced SIMD input prints as Zn.
+(define_insn "*aarch64_pred_reduc_<optab>_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(unspec:<VEL> [(match_operand:<VPRED> 1 "register_operand" "Upl")
+		       (match_operand:VQ_I 2 "register_operand" "w")]
+		      SVE_INT_REDUCTION_LOGICAL))]
+  "TARGET_SVE"
+  "<sve_int_op>\t%<Vetype>0, %1, %Z2.<Vetype>"
+)
+
 ;; Unpredicated integer reductions.
 (define_expand "reduc_<optab>_scal_<mode>"
   [(set (match_operand:<VEL> 0 "register_operand")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 20a318e023b..ad707f93f42 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3036,6 +3036,10 @@
    (UNSPEC_BFMLSLB "TARGET_SME2 && TARGET_STREAMING_SME")
    (UNSPEC_BFMLSLT "TARGET_SME2 && TARGET_STREAMING_SME")])
 
+;; SVE integer reductions that are bitwise logical operations.
+(define_int_iterator SVE_INT_REDUCTION_LOGICAL [UNSPEC_ANDV UNSPEC_IORV
+						UNSPEC_XORV])
+
 (define_int_iterator SVE_INT_REDUCTION [UNSPEC_ANDV
 					UNSPEC_IORV
 					UNSPEC_SMAXV
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/logical_reduc.c b/gcc/testsuite/gcc.target/aarch64/sve/logical_reduc.c
new file mode 100644
index 00000000000..6ad9eea610e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/logical_reduc.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-autovec-preference=asimd-only" } */
+/* Bitwise reductions of 16-byte arrays should use SVE ANDV, ORV and EORV.  */
+#include <stdint.h>
+
+#define BIT_AND(TYPE, N)				\
+  TYPE __attribute__ ((noinline))			\
+  f_and_##TYPE (TYPE *restrict a)			\
+  {							\
+    TYPE b = -1;					\
+    for (int i = 0; i < N; ++i)				\
+      b &= a[i];					\
+    return b;						\
+  }
+
+#define BIT_OR(TYPE, N)					\
+  TYPE __attribute__ ((noinline))			\
+  f_or_##TYPE (TYPE *restrict a)			\
+  {							\
+    TYPE b = 0;						\
+    for (int i = 0; i < N; ++i)				\
+      b |= a[i];					\
+    return b;						\
+  }
+
+#define BIT_XOR(TYPE, N)				\
+  TYPE __attribute__ ((noinline))			\
+  f_xor_##TYPE (TYPE *restrict a)			\
+  {							\
+    TYPE b = 0;						\
+    for (int i = 0; i < N; ++i)				\
+      b ^= a[i];					\
+    return b;						\
+  }
+
+#define TEST_OP(T)					\
+  T (int8_t, 16)					\
+  T (int16_t, 8)					\
+  T (int32_t, 4)					\
+  T (uint8_t, 16)					\
+  T (uint16_t, 8)					\
+  T (uint32_t, 4)
+
+TEST_OP (BIT_AND)
+TEST_OP (BIT_OR)
+TEST_OP (BIT_XOR)
+
+/* { dg-final { scan-assembler-times {\tandv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2  } } */
+/* { dg-final { scan-assembler-times {\tandv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2  } } */
+/* { dg-final { scan-assembler-times {\tandv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2  } } */
+
+/* { dg-final { scan-assembler-times {\torv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2  } } */
+/* { dg-final { scan-assembler-times {\torv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2  } } */
+/* { dg-final { scan-assembler-times {\torv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2  } } */
+
+/* { dg-final { scan-assembler-times {\teorv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2  } } */
+/* { dg-final { scan-assembler-times {\teorv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2  } } */
+/* { dg-final { scan-assembler-times {\teorv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2  } } */
-- 
2.44.0