From 1e1603b7b22429f8359d083d04a55ac888763910 Mon Sep 17 00:00:00 2001
From: root <root@lego-c2-qs-79.ipp2u2.colossus.nvidia.com>
Date: Mon, 13 Oct 2025 23:02:43 -0700
Subject: [PATCH] tree-optimization/61338 - Optimize redundant reverse
 permutations in vectorized stores

This patch eliminates redundant reverse permutations in vectorized reverse
loops by detecting and optimizing patterns during store vectorization.

The reverse load (b[i]) generates PERM, operations are applied, then the
reverse store adds another PERM. This creates redundant permute pairs that
we now detect and eliminate.

With the patch, for the example loop
  for (int i = N - 1; i >= 0; i--)
    {
      a[i] = b[i] + 1.0f;
    }
Changes to the following
-	ldr	q29, [x0, x2]
-	tbl	v29.16b, {v29.16b}, v31.16b
-	fadd	v29.4s, v29.4s, v30.4s
-	tbl	v29.16b, {v29.16b}, v31.16b
-	str	q29, [x3, x2]
+	ldr	q30, [x0, x2]
+	fadd	v30.4s, v30.4s, v31.4s
+	str	q30, [x3, x2]

	PR tree-optimization/61338

gcc/ChangeLog:

	PR tree-optimization/61338
	* tree-vect-stmts.cc (get_vector_perm_operand): New helper to
	look through a reverse VEC_PERM_EXPR and return its unpermuted
	operand.
	(vect_find_reverse_permute_operand): New helper function to find
	reverse permutations through element-wise operation chains.
	Returns true only if ALL operands have reverse permutations.
	(vectorizable_store): Use the new helpers to eliminate redundant
	reverse permutations.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/slp-permute-reverse-1.c: New test for basic
	reverse permute optimization (simple copy).
	* gcc.dg/vect/slp-permute-reverse-2.c: New runtime test for
	basic pattern.

Signed-off-by: Kugan Vivekanandarajah <kvivekananda@nvidia.com>
---
 gcc/fortran/resolve.cc                        |   2 +-
 .../gcc.dg/vect/slp-permute-reverse-1.c       |  21 +++
 .../gcc.dg/vect/slp-permute-reverse-2.c       |  53 ++++++
 gcc/tree-vect-stmts.cc                        | 159 +++++++++++++++++-
 4 files changed, 230 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c

diff --git a/gcc/fortran/resolve.cc b/gcc/fortran/resolve.cc
index 4c45de08f03..f419f5c7559 100644
--- a/gcc/fortran/resolve.cc
+++ b/gcc/fortran/resolve.cc
@@ -2030,7 +2030,7 @@ static bool
 resolve_actual_arglist (gfc_actual_arglist *arg, procedure_type ptype,
 			bool no_formal_args)
 {
-  gfc_symbol *sym;
+  gfc_symbol *sym = NULL;
   gfc_symtree *parent_st;
   gfc_expr *e;
   gfc_component *comp;
diff --git a/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c
new file mode 100644
index 00000000000..703d201eb97
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details" } */
+
+/* PR tree-optimization/61338: the reverse load permute and the reverse
+   store permute should cancel, leaving no permute instructions.  */
+
+#define N 32000
+
+void test_reverse_loop (float * __restrict__ a,
+                        float * __restrict__ b)
+{
+  for (int i = N - 1; i >= 0; i--)
+    {
+      a[i] = b[i] + 1.0f;
+    }
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
+/* { dg-final { scan-tree-dump "Optimized redundant SLP permutes" "vect" } } */
+/* { dg-final { scan-assembler-not "tbl\\t" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c
new file mode 100644
index 00000000000..cf8ce512d85
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-O3" } */
+
+/* PR tree-optimization/61338.  Runtime test to verify correctness of
+   the redundant permute optimization for reverse loops.  */
+
+#define N 1024
+
+float a[N], b[N];
+
+__attribute__((noipa,noinline))
+void test_reverse (void)
+{
+  for (int i = N - 1; i >= 0; i--)
+    a[i] = b[i] + 1.0f;
+}
+
+__attribute__((noipa,noinline))
+void test_forward (void)
+{
+  for (int i = 0; i < N; i++)
+    a[i] = b[i] + 1.0f;
+}
+
+int main (void)
+{
+  /* Initialize b array.  */
+  for (int i = 0; i < N; i++)
+    b[i] = (float)i;
+
+  /* Test reverse iteration.  */
+  test_reverse ();
+
+  /* Verify results.  */
+  for (int i = 0; i < N; i++)
+    if (a[i] != b[i] + 1.0f)
+      __builtin_abort ();
+
+  /* Reset for forward test.  */
+  for (int i = 0; i < N; i++)
+    a[i] = 0.0f;
+
+  /* The forward loop must be unaffected by the optimization.  */
+  test_forward ();
+
+  /* Verify results.  */
+  for (int i = 0; i < N; i++)
+    if (a[i] != b[i] + 1.0f)
+      __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 59e1d24e8d5..ed735f84c48 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -7755,6 +7755,117 @@ vectorizable_scan_store (vec_info *vinfo, stmt_vec_info stmt_info,
   return true;
 }
 
+/* Helper function to see if OPERAND is derived from a reverse
+   permutation (matching REVERSE_MASK).  If so, return the operand
+   before permutation.  Return NULL_TREE otherwise.  */
+
+static tree
+get_vector_perm_operand (tree operand, tree reverse_mask)
+{
+  /* Bail out quickly on non-SSA inputs.  */
+  if (TREE_CODE (operand) != SSA_NAME)
+    return NULL_TREE;
+
+  gimple *def = SSA_NAME_DEF_STMT (operand);
+  if (!def || !is_gimple_assign (def))
+    return NULL_TREE;
+
+  tree_code code = gimple_assign_rhs_code (def);
+
+  /* Direct reverse permute - base case.  The mask built by
+     perm_mask_for_reverse only selects lanes of the first input, so
+     the unpermuted value is RHS1 regardless of RHS2.  */
+  if (code == VEC_PERM_EXPR
+      && operand_equal_p (gimple_assign_rhs3 (def), reverse_mask, 0))
+    return gimple_assign_rhs1 (def);
+  return NULL_TREE;
+}
+
+/* Helper function to check if OPERAND is derived from a reverse
+   permutation (matching REVERSE_MASK).
+
+   Returns true if ALL operands have reverse permutations and stores the
+   unpermuted result in *UNPERMUTED.  Returns false if ANY operand lacks
+   a reverse permutation.  Any statements needed to rebuild the
+   operation on the unpermuted operands are emitted via
+   vect_finish_stmt_generation.
+
+   The reverse load (b[i]) generates PERM, then operations are applied,
+   then the reverse store adds another PERM.  This function detects and
+   cancels these redundant permute pairs.  */
+
+static bool
+vect_find_reverse_permute_operand (vec_info *vinfo, stmt_vec_info stmt_info,
+				   tree operand, tree reverse_mask,
+				   gimple_stmt_iterator *gsi, tree *unpermuted)
+{
+  *unpermuted = NULL_TREE;
+  if (TREE_CODE (operand) != SSA_NAME)
+    return false;
+
+  gimple *def = SSA_NAME_DEF_STMT (operand);
+  if (!def || !is_gimple_assign (def))
+    return false;
+
+  tree_code code = gimple_assign_rhs_code (def);
+  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (operand)));
+
+  /* Lane permutations commute with element-wise operations, so a
+     reverse permute feeding an element-wise unary or binary operation
+     can be hoisted past it and cancelled against the store permute.  */
+  if (TREE_CODE_CLASS (code) == tcc_unary)
+    {
+      /* The single operand must itself be reverse-permuted.  */
+      tree unperm1 = get_vector_perm_operand (gimple_assign_rhs1 (def),
+					      reverse_mask);
+      if (!unperm1)
+	return false;
+
+      /* Rebuild the operation on the unpermuted operand.  */
+      tree new_result = make_ssa_name (TREE_TYPE (operand));
+      gimple *new_stmt = gimple_build_assign (new_result, code, unperm1);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+      *unpermuted = new_result;
+      return true;
+    }
+  else if (TREE_CODE_CLASS (code) == tcc_binary)
+    {
+      tree op1 = gimple_assign_rhs1 (def);
+      tree op2 = gimple_assign_rhs2 (def);
+      tree unperm1, unperm2 = NULL_TREE;
+
+      if (!(unperm1 = get_vector_perm_operand (op1, reverse_mask)))
+	return false;
+
+      if (TREE_CODE (op2) == SSA_NAME)
+	{
+	  /* An SSA operand must also be reverse-permuted.  */
+	  if (!(unperm2 = get_vector_perm_operand (op2, reverse_mask)))
+	    return false;
+	}
+      else if (TREE_CODE (op2) == VECTOR_CST
+	       && VECTOR_CST_DUPLICATE_P (op2))
+	/* A splat constant is invariant under any lane permutation, so
+	   it can be used unchanged.  Guard the VECTOR_CST check so that
+	   scalar constants (e.g. shift counts) do not trip the tree
+	   check inside VECTOR_CST_DUPLICATE_P.  */
+	unperm2 = op2;
+      else
+	return false;
+
+      /* All operands are reverse-permuted (or permute-invariant).
+	 Rebuild the operation with the unpermuted operands.  */
+      tree new_result = make_ssa_name (TREE_TYPE (operand));
+      gimple *new_stmt = gimple_build_assign (new_result, code,
+					      unperm1, unperm2);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+      *unpermuted = new_result;
+      return true;
+    }
+
+  return false;
+}
+
 
 /* Function vectorizable_store.
 
@@ -8042,7 +8153,7 @@ vectorizable_store (vec_info *vinfo,
       tree offvar = NULL_TREE;
       tree ivstep;
       tree running_off;
-      tree stride_base, stride_step, alias_off;
+      tree stride_base, stride_step = NULL_TREE, alias_off;
       tree vec_oprnd = NULL_TREE;
       tree dr_offset;
       /* Checked by get_load_store_type.  */
@@ -8935,11 +9046,51 @@ vectorizable_store (vec_info *vinfo,
 	  if (costing_p)
 	    inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
 					     slp_node, 0, vect_body);
+	  else if (TREE_CODE (vec_oprnd) == SSA_NAME)
+	    {
+	      /* Try to eliminate a redundant pair of reverse permutes:
+		 if every operand of the statement that defines
+		 VEC_OPRND is itself the result of a reverse permute,
+		 rebuild that statement on the unpermuted operands and
+		 skip the store-side permute entirely.  */
+	      bool skip_permute = false;
+	      tree reverse_mask = perm_mask_for_reverse (vectype);
+
+	      if (reverse_mask)
+		{
+		  tree unpermuted = NULL_TREE;
+		  if (vect_find_reverse_permute_operand (vinfo, stmt_info,
+							 vec_oprnd,
+							 reverse_mask,
+							 gsi, &unpermuted))
+		    {
+		      vec_oprnd = unpermuted;
+		      skip_permute = true;
+
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_NOTE, vect_location,
+					 "Optimized redundant SLP permutes\n");
+		    }
+		}
+
+	      if (!skip_permute)
+		{
+		  /* Generate the reverse permute statement.  REVERSE_MASK
+		     was computed above and re-querying
+		     perm_mask_for_reverse cannot change it.  */
+		  tree new_temp = make_ssa_name (vectype);
+		  gimple *perm_stmt
+		    = gimple_build_assign (new_temp, VEC_PERM_EXPR, vec_oprnd,
+					   vec_oprnd, reverse_mask);
+		  vect_finish_stmt_generation (vinfo, stmt_info,
+					       perm_stmt, gsi);
+		  vec_oprnd = new_temp;
+		}
+	    }
 	  else
 	    {
 	      tree perm_mask = perm_mask_for_reverse (vectype);
 	      tree new_temp = make_ssa_name (vectype);
-
 	      /* Generate the permute statement.  */
 	      gimple *perm_stmt
 		= gimple_build_assign (new_temp, VEC_PERM_EXPR, vec_oprnd,
@@ -9301,7 +9452,7 @@ vectorizable_load (vec_info *vinfo,
   gphi *phi = NULL;
   vec<tree> dr_chain = vNULL;
   bool grouped_load = false;
-  stmt_vec_info first_stmt_info;
+  stmt_vec_info first_stmt_info = NULL;
   stmt_vec_info first_stmt_info_for_drptr = NULL;
   bool compute_in_loop = false;
   class loop *at_loop;
@@ -9711,7 +9862,7 @@ vectorizable_load (vec_info *vinfo,
       tree ivstep;
       tree running_off;
       vec<constructor_elt, va_gc> *v = NULL;
-      tree stride_base, stride_step, alias_off;
+      tree stride_base, stride_step = NULL_TREE, alias_off;
       /* Checked by get_load_store_type.  */
       unsigned int const_nunits = nunits.to_constant ();
       unsigned HOST_WIDE_INT cst_offset = 0;
-- 
2.34.1

