From 961037c4a069406e9fd0d0bb9d8bdf3a2afbd731 Mon Sep 17 00:00:00 2001
From: Kugan <kvivekananda@nvidia.com>
Date: Sun, 27 Oct 2024 02:24:17 +0530
Subject: [PATCH] Fix SLP when ifcvt versioned loop is not vectorized

When ifcvt versions a loop, it sets dont_vectorize on the scalar loop.  If the
loop meant for vectorization is not vectorized and is removed, the scalar loop
is still left with dont_vectorize set.  As a result, BB vectorization does not
happen.

This patch adds a new flag, dont_loop_vectorize (distinct from the general
dont_vectorize), specifically for versioned loops.  BB vectorization does not
need to honour it and can still vectorize such loops.

gcc/ChangeLog:

	* cfgloop.h: New dont_loop_vectorize in loop.
	* tree-if-conv.cc (version_loop_for_if_conversion): Set
	dont_loop_vectorize.
	(versionable_outer_loop_p): Check dont_loop_vectorize.
	* tree-loop-distribution.cc (version_loop_by_alias_check): Set
	dont_loop_vectorize.
	* tree-vect-loop.cc (vect_transform_loop): Clear dont_loop_vectorize
	for epilogue loop.
	* tree-vectorizer.cc (set_uid_loop_bbs): Set dont_loop_vectorize.
	(try_vectorize_loop_1): Set dont_loop_vectorize.
	(pass_vectorize::execute): Check dont_loop_vectorize.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/bb-slp-77.c: New test.

Signed-off-by: Kugan Vivekanandarajah <kvivekananda@nvidia.com>
---
 gcc/cfgloop.h                         |   3 +
 gcc/testsuite/gcc.dg/vect/bb-slp-77.c |  74 ++++++++++++++++++
 gcc/tree-if-conv.cc                   |   4 +-
 gcc/tree-loop-distribution.cc         |   1 +
 gcc/tree-vect-loop.cc                 |   2 +-
 gcc/tree-vectorizer.cc                | 103 +++++++++++++-------------
 6 files changed, 135 insertions(+), 52 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-77.c

diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 30b5e40d0d9..e2af7ee6da4 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -223,6 +223,9 @@ public:
   /* True if this loop should never be vectorized.  */
   unsigned dont_vectorize : 1;
 
+  /* True if this loop should not be loop vectorized; BB SLP is allowed.  */
+  unsigned dont_loop_vectorize : 1;
+
   /* True if we should try harder to vectorize this loop.  */
   unsigned force_vectorize : 1;
 
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-77.c b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c
new file mode 100644
index 00000000000..b2cc1d114f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c
@@ -0,0 +1,74 @@
+/* Check that BB SLP is not blocked by if-conversion loop versioning.  */
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+#include <stdint.h>
+#include <string.h>
+
+/* VX element: D and M index table, VAL1 is a 32-bit flag word, VAL2 packs two nibbles per byte.  */
+typedef struct {
+    uint16_t d;
+    uint16_t m;
+    uint8_t val1[4];
+    uint8_t val2[16];
+} st1;
+/* VY element: float factors D and S followed by 32 signed bytes.  */
+typedef struct {
+    float d;
+    float s;
+    int8_t val2[32];
+} st2;
+
+float table[1 << 16];
+/* Return the table entry selected by F (copied through memcpy into the index).  */
+inline static float foo(uint16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table[s];
+}
+
+/* For each of the n / 32 (st1, st2) pairs in VX and VY, sum nibble-by-byte products and add the scaled result to a total stored in *S.  */
+void test(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int nb = n / 32;
+
+    /* View the untyped inputs as arrays of st1 and st2.  */
+    const st1 * restrict x = vx;
+    const st2 * restrict y = vy;
+
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        uint32_t val1;
+        memcpy(&val1, x[i].val1, sizeof(val1));
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+        /* Both arms run the same 16-step reduction; only the first ORs a VAL1 bit into bit 4.  */
+        if (val1) {
+            for (int j = 0; j < 16; ++j) {
+                const uint8_t xh_0 = ((val1 >> (j)) << 4) & 0x10;
+                const uint8_t xh_1 = ((val1 >> (j + 12)) ) & 0x10;
+
+                const int32_t x0 = (x[i].val2[j] & 0xF) | xh_0;
+                const int32_t x1 = (x[i].val2[j] >> 4) | xh_1;
+
+                sumi0 += (x0 * y[i].val2[j]);
+                sumi1 += (x1 * y[i].val2[j + 16]);
+            }
+        } else {
+            for (int j = 0; j < 16; ++j) {
+                const int32_t x0 = (x[i].val2[j] & 0xF);
+                const int32_t x1 = (x[i].val2[j] >> 4);
+
+                sumi0 += (x0 * y[i].val2[j]);
+                sumi1 += (x1 * y[i].val2[j + 16]);
+            }
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (foo(x[i].d)*y[i].d)*sumi + foo(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1"  { target { { vect_int_mult && vect_element_align } && { ! powerpc*-*-* } } } } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 0346a1376c5..0aa3d76d68d 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -3254,7 +3254,8 @@ version_loop_for_if_conversion (class loop *loop, vec<gimple *> *preds)
   if (new_loop == NULL)
     return NULL;
 
-  new_loop->dont_vectorize = true;
+  new_loop->dont_vectorize = false;
+  new_loop->dont_loop_vectorize = true;
   new_loop->force_vectorize = false;
   gsi = gsi_last_bb (cond_bb);
   gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num));
@@ -3283,6 +3284,7 @@ versionable_outer_loop_p (class loop *loop)
 {
   if (!loop_outer (loop)
       || loop->dont_vectorize
+      || loop->dont_loop_vectorize
       || !loop->inner
       || loop->inner->next
       || !single_exit (loop)
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index f0430ede2f4..674fb21f0ee 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -2829,6 +2829,7 @@ version_loop_by_alias_check (vec<struct partition *> *partitions,
   loop->orig_loop_num = nloop->num;
   nloop->orig_loop_num = nloop->num;
   nloop->dont_vectorize = true;
+  nloop->dont_loop_vectorize = true;
   nloop->force_vectorize = false;
 
   if (call_stmt)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5cd4bdb32e0..1a7d7459d36 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -12666,7 +12666,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
       epilogue->simduid = loop->simduid;
       epilogue->force_vectorize = loop->force_vectorize;
-      epilogue->dont_vectorize = false;
+      epilogue->dont_loop_vectorize = false;
     }
 
   return epilogue;
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index d4ab47349a3..a69b3551712 100644
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -961,7 +961,7 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple *loop_vectorized_call,
       if (g)
 	{
 	  arg = gimple_call_arg (g, 0);
-	  get_loop (fun, tree_to_shwi (arg))->dont_vectorize = true;
+	  get_loop (fun, tree_to_shwi (arg))->dont_loop_vectorize = true;
 	  fold_loop_internal_call (g, boolean_false_node);
 	}
     }
@@ -1138,7 +1138,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 	 vectorize LOOP_VECTORIZED guarded inner loop of the scalar
 	 loop version.  */
       if (loop_vectorized_call && loop->inner)
-	loop->inner->dont_vectorize = true;
+	loop->inner->dont_loop_vectorize = true;
       return ret;
     }
 
@@ -1255,53 +1255,56 @@ pass_vectorize::execute (function *fun)
      than all previously defined loops.  This fact allows us to run
      only over initial loops skipping newly generated ones.  */
   for (auto loop : loops_list (fun, 0))
-    if (loop->dont_vectorize)
-      {
-	any_ifcvt_loops = true;
-	/* If-conversion sometimes versions both the outer loop
-	   (for the case when outer loop vectorization might be
-	   desirable) as well as the inner loop in the scalar version
-	   of the loop.  So we have:
-	    if (LOOP_VECTORIZED (1, 3))
-	      {
-		loop1
-		  loop2
-	      }
-	    else
-	      loop3 (copy of loop1)
-		if (LOOP_VECTORIZED (4, 5))
-		  loop4 (copy of loop2)
-		else
-		  loop5 (copy of loop4)
-	   If loops' iteration gives us loop3 first (which has
-	   dont_vectorize set), make sure to process loop1 before loop4;
-	   so that we can prevent vectorization of loop4 if loop1
-	   is successfully vectorized.  */
-	if (loop->inner)
-	  {
-	    gimple *loop_vectorized_call
-	      = vect_loop_vectorized_call (loop);
-	    if (loop_vectorized_call
-		&& vect_loop_vectorized_call (loop->inner))
-	      {
-		tree arg = gimple_call_arg (loop_vectorized_call, 0);
-		class loop *vector_loop
-		  = get_loop (fun, tree_to_shwi (arg));
-		if (vector_loop && vector_loop != loop)
-		  {
-		    /* Make sure we don't vectorize it twice.  */
-		    vector_loop->dont_vectorize = true;
-		    ret |= try_vectorize_loop (simduid_to_vf_htab,
-					       &num_vectorized_loops,
-					       vector_loop, fun);
-		  }
-	      }
-	  }
-      }
-    else
-      ret |= try_vectorize_loop (simduid_to_vf_htab, &num_vectorized_loops,
-				 loop, fun);
-
+    {
+      if (loop->dont_vectorize && !loop->dont_loop_vectorize)
+	continue;
+      if (loop->dont_loop_vectorize)
+	{
+	  any_ifcvt_loops = true;
+	  /* If-conversion sometimes versions both the outer loop
+	     (for the case when outer loop vectorization might be
+	     desirable) as well as the inner loop in the scalar version
+	     of the loop.  So we have:
+	      if (LOOP_VECTORIZED (1, 3))
+		{
+		  loop1
+		    loop2
+		}
+	      else
+		loop3 (copy of loop1)
+		  if (LOOP_VECTORIZED (4, 5))
+		    loop4 (copy of loop2)
+		  else
+		    loop5 (copy of loop4)
+	     If loops' iteration gives us loop3 first (which has
+	     dont_loop_vectorize set), make sure to process loop1 before loop4;
+	     so that we can prevent vectorization of loop4 if loop1
+	     is successfully vectorized.  */
+	  if (loop->inner)
+	    {
+	      gimple *loop_vectorized_call
+		= vect_loop_vectorized_call (loop);
+	      if (loop_vectorized_call
+		  && vect_loop_vectorized_call (loop->inner))
+		{
+		  tree arg = gimple_call_arg (loop_vectorized_call, 0);
+		  class loop *vector_loop
+		    = get_loop (fun, tree_to_shwi (arg));
+		  if (vector_loop && vector_loop != loop)
+		    {
+		      /* Make sure we don't vectorize it twice.  */
+		      vector_loop->dont_loop_vectorize = true;
+		      ret |= try_vectorize_loop (simduid_to_vf_htab,
+						 &num_vectorized_loops,
+						 vector_loop, fun);
+		    }
+		}
+	    }
+	}
+      else
+	ret |= try_vectorize_loop (simduid_to_vf_htab, &num_vectorized_loops,
+				   loop, fun);
+    }
   vect_location = dump_user_location_t ();
 
   statistics_counter_event (fun, "Vectorized loops", num_vectorized_loops);
@@ -1317,7 +1320,7 @@ pass_vectorize::execute (function *fun)
     for (i = 1; i < number_of_loops (fun); i++)
       {
 	class loop *loop = get_loop (fun, i);
-	if (loop && loop->dont_vectorize)
+	if (loop && loop->dont_loop_vectorize)
 	  {
 	    gimple *g = vect_loop_vectorized_call (loop);
 	    if (g)
-- 
2.43.2

