The following records both a possibly notinbranch and an inbranch
SIMD clone during analysis so that we can properly handle the
late decision on loop masking.  Recording of linear-clause data
from analysis is extended to cover linear-clause arguments from
both clones.

This also fixes AVX512 masked loop code generation in line with
the previous fixes.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Does this look OK?

Thanks,
Richard.

        PR tree-optimization/122776
        * tree-vectorizer.h (vect_simd_clone_data::clone,
        vect_simd_clone_data::clone_inbranch): New fields for
        the two selected clones.
        * tree-vect-stmts.cc (vectorizable_simd_clone_call): Record
        both a possibly notinbranch and an inbranch clone.  Delay
        the choice between both to code generation based on
        LOOP_VINFO_FULLY_MASKED_P.  Fix masked loop inbranch
        code generation for the AVX512 case.

        * gcc.dg/vect/vect-simd-clone-24.c: New testcase.
        * gcc.dg/gomp/pr110485.c: Adjust.
---
 gcc/testsuite/gcc.dg/gomp/pr110485.c          |   2 +-
 .../gcc.dg/vect/vect-simd-clone-24.c          |  22 ++++
 gcc/tree-vect-stmts.cc                        | 102 ++++++++++--------
 gcc/tree-vectorizer.h                         |   6 +-
 4 files changed, 86 insertions(+), 46 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c

diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c 
b/gcc/testsuite/gcc.dg/gomp/pr110485.c
index ba6817a127f..5183f3f403c 100644
--- a/gcc/testsuite/gcc.dg/gomp/pr110485.c
+++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
@@ -16,4 +16,4 @@ void foo (int n)
 }
 
 /* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
-/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a 
non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump "can't use a fully-masked loop because no 
masked simd clone was available" "vect" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
new file mode 100644
index 00000000000..081c19bf58f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd --param vect-partial-vector-usage=1 
-fdump-tree-dce6" } */
+/* { dg-additional-options "-mavx512f" { target avx512f } } */
+
+#pragma omp declare simd simdlen(16)
+int __attribute__((const)) baz (int x);
+
+int a[1024];
+
+void foo (int n, int * __restrict b)
+{
+  for (int i = 0; i < n; ++i)
+    if (baz (a[i]))
+      b[i] = baz (b[i]);
+}
+
+/* One notinbranch SIMD call, one inbranch in the main vector loop and two
+   inbranch in the masked epilog.  */
+/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] \\\(\[^,\]\+\\\)" 1 
"dce6" { target avx512f } } } */
+/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] 
\\\(\[^,\]\+,\[^,\]\+\\\)" 3 "dce6" { target avx512f } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a18772f5928..bfed98b8af0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4219,9 +4219,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
   unsigned group_size = SLP_TREE_LANES (slp_node);
   unsigned int badness = 0;
+  unsigned int badness_inbranch = 0;
   struct cgraph_node *bestn = NULL;
+  struct cgraph_node *bestn_inbranch = NULL;
   if (!cost_vec)
-    bestn = cgraph_node::get (simd_clone_info[0]);
+    bestn = ((loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+            ? data.clone_inbranch : data.clone);
   else
     for (struct cgraph_node *n = node->simd_clones; n != NULL;
         n = n->simdclone->next_clone)
@@ -4351,14 +4354,19 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
                        SIMD_CLONE_ARG_TYPE_MASK);
            /* Penalize using a masked SIMD clone in a non-masked loop, that is
               not in a branch, as we'd have to construct an all-true mask.  */
-           if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-             this_badness += 64;
+           this_badness += 64;
          }
        if (bestn == NULL || this_badness < badness)
          {
            bestn = n;
            badness = this_badness;
          }
+       if (n->simdclone->inbranch
+           && (bestn_inbranch == NULL || this_badness < badness_inbranch))
+         {
+           bestn_inbranch = n;
+           badness_inbranch = this_badness;
+         }
       }
 
   if (bestn == NULL)
@@ -4394,6 +4402,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
                               "incompatible vector types for invariants\n");
            return false;
          }
+
+      if (!bestn_inbranch && loop_vinfo)
+       {
+         if (dump_enabled_p ()
+             && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "can't use a fully-masked loop because no"
+                            " masked simd clone was available.\n");
+         LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+       }
+
       /* When the original call is pure or const but the SIMD ABI dictates
         an aggregate return we will have to use a virtual definition and
         in a loop eventually even need to add a virtual PHI.  That's
@@ -4407,45 +4426,41 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
         so automagic virtual operand updating doesn't work.  */
       if (gimple_vuse (stmt))
        vinfo->any_known_not_updated_vssa = true;
-      simd_clone_info.safe_push (bestn->decl);
-      for (i = 0; i < bestn->simdclone->nargs; i++)
-       {
-         switch (bestn->simdclone->args[i].arg_type)
-           {
-           default:
-             continue;
-           case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
-           case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
-             {
-               simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
-               simd_clone_info.safe_push (arginfo[i].op);
-               tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
-                          ? size_type_node : TREE_TYPE (arginfo[i].op);
-               tree ls = build_int_cst (lst, arginfo[i].linear_step);
-               simd_clone_info.safe_push (ls);
-               tree sll = arginfo[i].simd_lane_linear
-                          ? boolean_true_node : boolean_false_node;
-               simd_clone_info.safe_push (sll);
-             }
-             break;
-           case SIMD_CLONE_ARG_TYPE_MASK:
-             if (loop_vinfo
-                 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-               vect_record_loop_mask (loop_vinfo,
-                                      &LOOP_VINFO_MASKS (loop_vinfo),
-                                      ncopies_in, vectype, op);
-             break;
-           }
-       }
 
-      if (!bestn->simdclone->inbranch && loop_vinfo)
-       {
-         if (dump_enabled_p ()
-             && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-           dump_printf_loc (MSG_NOTE, vect_location,
-                            "can't use a fully-masked loop because a"
-                            " non-masked simd clone was selected.\n");
-         LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+      data.clone = bestn;
+      data.clone_inbranch = bestn_inbranch;
+
+      simd_clone_info.safe_push (NULL_TREE);
+      for (i = 0;
+          i < (bestn_inbranch ? bestn_inbranch : bestn)->simdclone->nargs; i++)
+       {
+         if (loop_vinfo
+             && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+             && (bestn_inbranch->simdclone->args[i].arg_type
+                 == SIMD_CLONE_ARG_TYPE_MASK))
+           vect_record_loop_mask (loop_vinfo,
+                                  &LOOP_VINFO_MASKS (loop_vinfo),
+                                  ncopies_in, vectype, op);
+         else if ((bestn->simdclone->args[i].arg_type
+                   == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
+                  || (bestn->simdclone->args[i].arg_type
+                      == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP)
+                  || (bestn_inbranch
+                      && ((bestn_inbranch->simdclone->args[i].arg_type
+                           == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
+                          || (bestn_inbranch->simdclone->args[i].arg_type
+                              == 
SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP))))
+           {
+             simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
+             simd_clone_info.safe_push (arginfo[i].op);
+             tree lst = (POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
+                         ? size_type_node : TREE_TYPE (arginfo[i].op));
+             tree ls = build_int_cst (lst, arginfo[i].linear_step);
+             simd_clone_info.safe_push (ls);
+             tree sll = (arginfo[i].simd_lane_linear
+                         ? boolean_true_node : boolean_false_node);
+             simd_clone_info.safe_push (sll);
+           }
        }
 
       SLP_TREE_TYPE (slp_node) = call_simd_clone_vec_info_type;
@@ -4816,9 +4831,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 
          tree masktype = bestn->simdclone->args[mask_i].vector_type;
          if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
-           /* Guess the number of lanes represented by masktype.  */
            callee_nelements = exact_div (bestn->simdclone->simdlen,
-                                         bestn->simdclone->nargs - nargs);
+                                         
bestn->simdclone->args[i].linear_step);
          else
            callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
          o = vector_unroll_factor (nunits, callee_nelements);
@@ -4828,7 +4842,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
                {
                  vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
                  mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-                                            ncopies, masktype, j);
+                                            ncopies_in, vectype, j);
                }
              else
                mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 55f0bee0eb7..3b264a6102c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -265,8 +265,12 @@ struct vect_simd_clone_data : vect_data {
   vect_simd_clone_data () = default;
   vect_simd_clone_data (vect_simd_clone_data &&other) = default;
 
+  /* Selected SIMD clone and clone for in-branch.  */
+  cgraph_node *clone;
+  cgraph_node *clone_inbranch;
+
   /* Selected SIMD clone's function info.  First vector element
-     is SIMD clone's function decl, followed by a pair of trees (base + step)
+     is NULL_TREE, followed by a pair of trees (base + step)
      for linear arguments (pair of NULLs for other arguments).  */
   auto_vec<tree> simd_clone_info;
 };
-- 
2.51.0

Reply via email to