From e97509382b6bb755336ec4aa220fabd968e69502 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Wed, 16 Oct 2024 04:10:08 -0700
Subject: [PATCH 4/6] aarch64: Optimize vector rotates into REV* instructions
 where possible

Some vector rotate operations can be implemented in a single instruction
rather than using the fallback SHL+USRA sequence.
In particular, when the rotate amount is half the bitwidth of the element
we can use a REV64, REV32 or REV16 instruction.  Rotating an element by
half its width just swaps its two halves, which is exactly what a REV*
operation on the half-width element size performs.
This patch adds that transformation to the recently added splitter for
vector rotates.
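
For example, the 64-bit-element rotate by 32 from the new test:

  v2di
  G1 (v2di r)
  {
    return (r >> 32) | (r << 32);
  }

previously expanded to an SHL+USRA sequence along the lines of (register
allocation and the final move are illustrative, not verbatim output):
  shl	v31.2d, v0.2d, 32
  usra	v31.2d, v0.2d, 32
  mov	v0.16b, v31.16b
and with this patch becomes a single:
  rev64	v0.4s, v0.4s
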
Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
	Declare prototype.
	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	Call the above.

gcc/testsuite/

	* gcc.target/aarch64/simd/pr117048_2.c: New test.
---
 gcc/config/aarch64/aarch64-protos.h           |  1 +
 gcc/config/aarch64/aarch64-simd.md            |  3 +
 gcc/config/aarch64/aarch64.cc                 | 49 ++++++++++++++
 .../gcc.target/aarch64/simd/pr117048_2.c      | 66 +++++++++++++++++++
 4 files changed, 119 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index d03c1fe798b..da0e657a513 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -776,6 +776,7 @@ bool aarch64_rnd_imm_p (rtx);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
 tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 543179d9fce..44c40512f30 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1313,6 +1313,9 @@
 	    (match_dup 4))
 	  (match_dup 3)))]
   {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
     operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 21d9a6b5a20..47859c4e31b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15998,6 +15998,55 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
   return true;
 }
 
+/* Emit an optimized sequence to perform a vector rotate
+   of REG by the vector constant amount AMNT and place the result
+   in DST.  Return true iff successful.  */
+
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+  amnt = unwrap_const_vec_duplicate (amnt);
+  gcc_assert (CONST_INT_P (amnt));
+  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
+  machine_mode mode = GET_MODE (reg);
+  /* Rotates by half the element width map down to REV* instructions.  */
+  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2)
+    {
+      machine_mode revmode;
+      unsigned unspec;
+      switch (mode)
+	{
+	case E_V2DImode:
+	  revmode = V4SImode;
+	  unspec = UNSPEC_REV64;
+	  break;
+	case E_V4SImode:
+	  revmode = V8HImode;
+	  unspec = UNSPEC_REV32;
+	  break;
+	case E_V2SImode:
+	  revmode = V4HImode;
+	  unspec = UNSPEC_REV32;
+	  break;
+	/* A V8HI (or V4HI) rotate by 8 could be implemented with a REV16
+	   instruction, but that is a standard BSWAP operation and won't go
+	   through this rotate optimization path.  */
+	default:
+	  return false;
+	}
+      rtx rev_reg = lowpart_subreg (revmode, reg, mode);
+      rtx unspec_op
+	= gen_rtx_UNSPEC (revmode, gen_rtvec (1, rev_reg), unspec);
+      rtx tmp_reg
+	= reload_completed ? lowpart_subreg (revmode, dst, mode)
+			   : gen_reg_rtx (revmode);
+      emit_set_insn (tmp_reg, unspec_op);
+      emit_move_insn (dst, lowpart_subreg (mode, tmp_reg, revmode));
+      return true;
+    }
+  return false;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
new file mode 100644
index 00000000000..7821909859d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+/*
+** G1:
+**	rev64	v0\.4s, v0\.4s
+**	ret
+*/
+v2di
+G1 (v2di r)
+{
+  return (r >> 32) | (r << 32);
+}
+
+/*
+** G2:
+**	rev32	v0\.8h, v0\.8h
+**	ret
+*/
+v4si
+G2 (v4si r)
+{
+  return (r >> 16) | (r << 16);
+}
+
+/*
+** G3:
+**	rev16	v0\.16b, v0\.16b
+**	ret
+*/
+v8hi
+G3 (v8hi r)
+{
+  return (r >> 8) | (r << 8);
+}
+
+/*
+** G4:
+**	rev32	v0\.4h, v0\.4h
+**	ret
+*/
+v2si
+G4 (v2si r)
+{
+  return (r >> 16) | (r << 16);
+}
+
+/*
+** G5:
+**	rev16	v0\.8b, v0\.8b
+**	ret
+*/
+v4hi
+G5 (v4hi r)
+{
+  return (r >> 8) | (r << 8);
+}
+
-- 
2.44.0

