Hi,

here is an enhancement to gcc, which allows load/store groups with size being 
non-power-of-2 to be vectorized.
Current implementation is using interleaving permutations to transform 
load/store groups. That is where power-of-2 requirements comes from.
For N-element vectors simplest approch would be to use N single element 
insertions for any required vector permutation.
And for 2-element vectors it is a reasonable amount of insertions.
Using this approach allows vectorization for cases, which were not supported 
before.

bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu.

Thanks,
Dmitrij
>From acf12c34f4bebbb5c6000a87bf9aaa58e48418bb Mon Sep 17 00:00:00 2001
From: Dmitrij Pochepko <dmitrij.poche...@bell-sw.com>
Date: Wed, 15 Jul 2020 18:07:26 +0300
Subject: [PATCH] non-power-of-2 group size can be vectorized for 2-element
 vectors case (PR96208)

Support for non-power-of-2 group size in vectorizer for 2-element vectors.

gcc/ChangeLog:

2020-07-15      Dmitrij Pochepko <dmitrij.poche...@bell-sw.com>

        PR gcc/96208

        * gcc/tree-vect-data-refs.c:
	(vect_all_2element_permutations_supported): New function
	(vect_permute_load_chain): added new branch with new algo
	(vect_permute_store_chain): Likewise
	(vect_grouped_load_supported): modified logic for new algo
	(vect_grouped_store_supported): Likewise
	(vect_transform_grouped_load): Likewise

gcc/testsuite/ChangeLog:

2020-07-15      Dmitrij Pochepko <dmitrij.poche...@bell-sw.com>

	PR gcc/96208

	* gcc.dg/vect/vect-non-pow2-group.c: New test
---
 gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c |  25 +++
 gcc/tree-vect-data-refs.c                       | 212 +++++++++++++++++++++---
 2 files changed, 218 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c b/gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c
new file mode 100644
index 0000000..7a22739
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target vect_perm } */
+/* { dg-additional-options "-fdump-tree-vect-details -fno-vect-cost-model -Ofast" } */
+
+typedef struct {
+    double m1, m2, m3, m4, m5;
+} the_struct_t;
+
+double bar1 (the_struct_t*);
+
+double foo (double* k, unsigned int n, the_struct_t* the_struct)
+{
+    unsigned int u;
+    the_struct_t result;
+    for (u=0; u < n; u++, k--) {
+	result.m1 += (*k)*the_struct[u].m1;
+	result.m2 += (*k)*the_struct[u].m2;
+	result.m3 += (*k)*the_struct[u].m3;
+	result.m4 += (*k)*the_struct[u].m4;
+    }
+    return bar1 (&result);
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index e35a215..caf4555 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -5027,6 +5027,37 @@ vect_create_destination_var (tree scalar_dest, tree vectype)
   return vec_dest;
 }
 
+/* Function vect_all_2element_permutations_supported
+
+   Returns TRUE if all possible permutations for 2-element
+   vectors are supported for requested mode.  */
+
+bool
+vect_all_2element_permutations_supported (machine_mode mode)
+{
+  // check all possible permutations for 2-element vectors
+  // for 2 vectors it'll be all low and high combinations:
+  // ll={0, 2}, lh={0, 3}, hl={1,2}, hh={1,3}
+  poly_uint64 nelt = GET_MODE_NUNITS (mode);
+  if (!known_eq (nelt, 2ULL))
+    return false;
+  vec_perm_builder sel (nelt, 2, 2);
+  sel.quick_grow (2);
+  sel[0] = 0;
+  sel[1] = 2;
+  vec_perm_indices ll (sel, 2, 2);
+  sel[1] = 3;
+  vec_perm_indices lh (sel, 2, 2);
+  sel[0] = 1;
+  vec_perm_indices hh (sel, 2, 2);
+  sel[1] = 2;
+  vec_perm_indices hl (sel, 2, 2);
+  return can_vec_perm_const_p (mode, ll)
+      && can_vec_perm_const_p (mode, lh)
+      && can_vec_perm_const_p (mode, hl)
+      && can_vec_perm_const_p (mode, hh);
+}
+
 /* Function vect_grouped_store_supported.
 
    Returns TRUE if interleave high and interleave low permutations
@@ -5038,13 +5069,15 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
   machine_mode mode = TYPE_MODE (vectype);
 
   /* vect_permute_store_chain requires the group size to be equal to 3 or
-     be a power of two.  */
-  if (count != 3 && exact_log2 (count) == -1)
+     be a power of two or 2-element vectors to be used.  */
+  if (count != 3 && exact_log2 (count) == -1
+       && !known_eq (GET_MODE_NUNITS (mode), 2ULL))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "the size of the group of accesses"
-			 " is not a power of 2 or not eqaul to 3\n");
+			 " is not a power of 2 or not eqaul to 3"
+			 " and vector element number is not 2\n");
       return false;
     }
 
@@ -5113,9 +5146,14 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
 	    }
 	  return true;
 	}
+      else if (known_eq (GET_MODE_NUNITS (mode), 2ULL))
+	{
+	  return vect_all_2element_permutations_supported (mode);
+	}
       else
 	{
-	  /* If length is not equal to 3 then only power of 2 is supported.  */
+	 /* If length is not equal to 3 and vectors are not 2-elements
+	    then only power of 2 is supported.  */
 	  gcc_assert (pow2p_hwi (count));
 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
 
@@ -5239,6 +5277,8 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
   tree data_ref;
   tree perm3_mask_low, perm3_mask_high;
   unsigned int i, j, n, log_length = exact_log2 (length);
+  poly_uint64 nelt_poly = TYPE_VECTOR_SUBPARTS (vectype);
+  unsigned int unelt;
 
   result_chain->quick_grow (length);
   memcpy (result_chain->address (), dr_chain.address (),
@@ -5247,7 +5287,7 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
   if (length == 3)
     {
       /* vect_grouped_store_supported ensures that this is constant.  */
-      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+      unsigned int nelt = nelt_poly.to_constant ();
       unsigned int j0 = 0, j1 = 0, j2 = 0;
 
       vec_perm_builder sel (nelt, nelt, 1);
@@ -5308,13 +5348,10 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
 	  (*result_chain)[j] = data_ref;
 	}
     }
-  else
+  else if (pow2p_hwi (length))
     {
-      /* If length is not equal to 3 then only power of 2 is supported.  */
-      gcc_assert (pow2p_hwi (length));
-
       /* The encoding has 2 interleaved stepped patterns.  */
-      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
+      poly_uint64 nelt = nelt_poly;
       vec_perm_builder sel (nelt, 2, 3);
       sel.quick_grow (6);
       for (i = 0; i < 3; i++)
@@ -5360,6 +5397,70 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
 		    length * sizeof (tree));
 	  }
     }
+  else
+    {
+      gcc_assert (known_eq (nelt_poly, 2ULL));
+      unelt = nelt_poly.to_constant ();
+      vec_perm_builder sel (unelt, unelt, 1);
+      sel.quick_grow (unelt);
+      vec_perm_indices indices;
+
+      // perm_x_y == x-th element of 1st vector and y-th element of 2nd vector
+      tree perm_0_0, perm_0_1, perm_1_1;
+      sel[0] = 0;
+      sel[1] = 1 + unelt;
+      indices.new_vector (sel, 2, unelt);
+      perm_0_1 = vect_gen_perm_mask_checked (vectype, indices);
+
+      sel[0] = 0;
+      sel[1] = 0 + unelt;
+      indices.new_vector (sel, 2, unelt);
+      perm_0_0 = vect_gen_perm_mask_checked (vectype, indices);
+
+      sel[0] = 1;
+      sel[1] = 1 + unelt;
+      indices.new_vector (sel, 2, unelt);
+      perm_1_1 = vect_gen_perm_mask_checked (vectype, indices);
+
+      if (length & 1)
+	{
+	  int idx1, idx2;
+	  tree currentPerm;
+	  for (j = 0; j < length; j++)
+	    {
+	      idx1 = (2 * j) % length;
+	      idx2 = (2 * j + 1) % length;
+	      vect1 = dr_chain[idx1];
+	      vect2 = dr_chain[idx2];
+	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm");
+	      if (j < length / 2) currentPerm = perm_0_0;
+	      else if (j == length / 2) currentPerm = perm_0_1;
+	      else currentPerm = perm_1_1;
+	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
+					       vect2, currentPerm);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
+	      (*result_chain)[j] = data_ref;
+	    }
+	}
+      else
+	{
+	  for (i = 0; i < unelt; i++)
+	    {
+	      for (j = 0; j < length / 2; j++)
+		{
+		  vect1 = dr_chain[2 * j];
+		  vect2 = dr_chain[2 * j + 1];
+		  data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm");
+		  tree selectedPerm = (i == 0) ? perm_0_0 : perm_1_1;
+		  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+						   vect1, vect2, selectedPerm);
+		  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
+					       gsi);
+		  (*result_chain)[i * length / 2 + j] = data_ref;
+		}
+	    }
+	}
+    }
 }
 
 /* Function vect_setup_realignment
@@ -5664,12 +5765,15 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
 
   /* vect_permute_load_chain requires the group size to be equal to 3 or
      be a power of two.  */
-  if (count != 3 && exact_log2 (count) == -1)
+  poly_uint64 poly_unelt = GET_MODE_NUNITS (mode);
+  if (count != 3 && exact_log2 (count) == -1
+      && !known_eq (poly_unelt, 2ULL))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "the size of the group of accesses"
-			 " is not a power of 2 or not equal to 3\n");
+			 " is not a power of 2 or not equal to 3"
+			 " and vector elements number is not 2\n");
       return false;
     }
 
@@ -5726,9 +5830,14 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
 	    }
 	  return true;
 	}
+      else if (known_eq (poly_unelt, 2ULL))
+	{
+	  return vect_all_2element_permutations_supported (mode);
+	}
       else
 	{
-	  /* If length is not equal to 3 then only power of 2 is supported.  */
+	  /* If length is not equal to 3 and vector size is not 2
+	  then only power of 2 is supported.  */
 	  gcc_assert (pow2p_hwi (count));
 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
 
@@ -5862,6 +5971,7 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
   gimple *perm_stmt;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   unsigned int i, j, log_length = exact_log2 (length);
+  poly_uint64 nelt_poly = TYPE_VECTOR_SUBPARTS (vectype);
 
   result_chain->quick_grow (length);
   memcpy (result_chain->address (), dr_chain.address (),
@@ -5870,7 +5980,7 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
   if (length == 3)
     {
       /* vect_grouped_load_supported ensures that this is constant.  */
-      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+      unsigned nelt = nelt_poly.to_constant ();
       unsigned int k;
 
       vec_perm_builder sel (nelt, nelt, 1);
@@ -5917,13 +6027,10 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
 	  (*result_chain)[k] = data_ref;
 	}
     }
-  else
+  else if (pow2p_hwi (length))
     {
-      /* If length is not equal to 3 then only power of 2 is supported.  */
-      gcc_assert (pow2p_hwi (length));
-
       /* The encoding has a single stepped pattern.  */
-      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
+      poly_uint64 nelt = nelt_poly;
       vec_perm_builder sel (nelt, 1, 3);
       sel.quick_grow (3);
       for (i = 0; i < 3; ++i)
@@ -5963,6 +6070,72 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
 		  length * sizeof (tree));
 	}
     }
+  else
+    {
+      gcc_assert (known_eq (nelt_poly, 2ULL));
+      unsigned int unelt = nelt_poly.to_constant ();
+      vec_perm_builder sel (unelt, unelt, 1);
+      sel.quick_grow (unelt);
+      vec_perm_indices indices;
+
+      // for 2-element vectors there are 2 cases to consider:
+      // 1) result vector Vi is {Vn[j], Vm[k]} and j == k
+      //    which means length is even.
+      // 2) result vector Vi is {Vn[j], Vm[k]} and j != k
+      //    which means length is odd.
+
+      // perm_x_y == x-th element of 1st vector and y-th element of 2nd vector
+      tree perm_0_0, perm_0_1, perm_1_0, perm_1_1;
+      sel[0] = 0;
+      sel[1] = 1 + unelt;
+      indices.new_vector (sel, 2, unelt);
+      perm_0_1 = vect_gen_perm_mask_checked (vectype, indices);
+
+      sel[0] = 1;
+      sel[1] = 0 + unelt;
+      indices.new_vector (sel, 2, unelt);
+      perm_1_0 = vect_gen_perm_mask_checked (vectype, indices);
+
+      sel[0] = 0;
+      sel[1] = 0 + unelt;
+      indices.new_vector (sel, 2, unelt);
+      perm_0_0 = vect_gen_perm_mask_checked (vectype, indices);
+
+      sel[0] = 1;
+      sel[1] = 1 + unelt;
+      indices.new_vector (sel, 2, unelt);
+      perm_1_1 = vect_gen_perm_mask_checked (vectype, indices);
+
+      for (i = 0; i < length; i++)
+	{
+	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm");
+
+	  first_vect = dr_chain[i / unelt];
+	  second_vect = dr_chain[(i + length) / unelt];
+
+	  tree selectedPerm;
+	  if (length & 1)
+	    { // odd length. different indicies
+	      // odd i, first vector index is 0 and second vector index is 1
+	      // even i, first vector index is 1 and second vector index is 0
+	      selectedPerm = (i & 1) ? perm_1_0 : perm_0_1;
+	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+					       first_vect, second_vect,
+					       selectedPerm);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
+	      (*result_chain)[i] = data_ref;
+	    }
+	  else
+	    { // even length. same indicies (both 0 or both 1)
+	      selectedPerm = (i & 1) ? perm_1_1 : perm_0_0;
+	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+					       first_vect, second_vect,
+					       selectedPerm);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
+	      (*result_chain)[i] = data_ref;
+	    }
+	}
+    }
 }
 
 /* Function vect_shift_permute_load_chain.
@@ -6340,6 +6513,7 @@ vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
       || pow2p_hwi (size)
+      || known_eq (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), 2ULL)
       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
 					 gsi, &result_chain))
     vect_permute_load_chain (vinfo, dr_chain,
-- 
2.7.4

Reply via email to