The following removes the confusion around num_mask_args that was
added to properly "guess" the number of mask elements in an AVX512
mask that's just represented as an int.  The actual mistake lies in
the mixup of 'ncopies', which is used to track the number of
OMP SIMD calls to be emitted rather than the number of input
vectors.  So this reverts the earlier r16-5374-g5c2fdfc24e343c,
uses the proper 'ncopies' for loop mask record/query and adjusts
the guessing of the SIMD arg mask elements.

Bootstrap and regtest on x86_64-unknown-linux-gnu in progress.

I do wonder if there's a way to figure out the number of mask
arguments we expect for a SIMD clone?  Consider

#pragma omp declare simd simdlen(32) inbranch
int __attribute__((const)) baz ();

where there's only the mask argument or a case with mixed type
arguments or return?

Thanks,
Richard.

        PR tree-optimization/122762
        PR tree-optimization/122736
        PR tree-optimization/122790
        * tree-vect-stmts.cc (vectorizable_simd_clone_call):
        Remove num_mask_args computation, use a proper ncopies
        to query/register loop masks, adjust code determining
        the number of mask elements in a mask argument.

        * gcc.dg/vect/vect-simd-clone-23.c: New testcase.
---
 .../gcc.dg/vect/vect-simd-clone-23.c          | 17 ++++++
 gcc/tree-vect-stmts.cc                        | 57 +++++++++----------
 2 files changed, 43 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c
new file mode 100644
index 00000000000..312ac9f468f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd" } */
+/* { dg-additional-options "-mavx512bw" { target avx512bw } } */
+
+#pragma omp declare simd simdlen(32) inbranch
+int __attribute__((const)) baz (int x);
+
+short a[1024];
+
+void __attribute__((noipa))
+foo (int n, int * __restrict b)
+{
+  for (int i = 0; i < n; ++i)
+    if (a[i])
+      b[i] = baz (b[i]);
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b8e36d4ee09..dd895f94747 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4339,12 +4339,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   if (bestn == NULL)
     return false;
 
-  unsigned int num_mask_args = 0;
-  for (i = 0; i < bestn->simdclone->nargs; i++)
-    if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
-      num_mask_args++;
-  if (!SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
-    gcc_assert (num_mask_args <= 1);
+  fndecl = bestn->decl;
+  nunits = bestn->simdclone->simdlen;
+  ncopies = vector_unroll_factor (vf * group_size, nunits);
+
+  /* ncopies is the number of SIMD clone calls we create, since simdlen
+     is not necessarily matching nunits of the vector types used, track
+     that in ncopies_in.  */
+  int ncopies_in = vect_get_num_copies (vinfo, slp_node);
 
   for (i = 0; i < nargs; i++)
     {
@@ -4402,8 +4404,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
          else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
            {
              if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
-                 || maybe_ne (exact_div (bestn->simdclone->simdlen,
-                                         num_mask_args),
+                 /* We cannot directly determine the number of mask elements
+                    in one mask argument for a SIMD clone based on the
+                    simdlen alone.  Apply a hopefully restrictive enough
+                    check to fulfil the requirement that vector mask
+                    inputs match up with the SIMD clone call argument
+                    requirements.  */
+                 || ncopies != ncopies_in
+                 || maybe_ne (bestn->simdclone->simdlen,
                               TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
                {
                  /* FORNOW we only have partial support for integer-type masks
@@ -4429,10 +4437,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
        }
     }
 
-  fndecl = bestn->decl;
-  nunits = bestn->simdclone->simdlen;
-  ncopies = vector_unroll_factor (vf * group_size, nunits);
-
   /* If the function isn't const, only allow it in simd loops where user
      has asserted that at least nunits consecutive iterations can be
      performed using SIMD instructions.  */
@@ -4491,20 +4495,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
            case SIMD_CLONE_ARG_TYPE_MASK:
              if (loop_vinfo
                  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-               {
-                 tree arg_vectype;
-                 if (SCALAR_INT_MODE_P
-                       (TYPE_MODE (bestn->simdclone->args[i].vector_type)))
-                   arg_vectype = build_truth_vector_type_for_mode
-                       (exact_div (bestn->simdclone->simdlen, num_mask_args),
-                        TYPE_MODE (bestn->simdclone->args[i].vector_type));
-                 else
-                   arg_vectype = bestn->simdclone->args[i].vector_type;
-                 vect_record_loop_mask (loop_vinfo,
-                                        &LOOP_VINFO_MASKS (loop_vinfo),
-                                        ncopies * num_mask_args, arg_vectype,
-                                        op);
-               }
+               vect_record_loop_mask (loop_vinfo,
+                                      &LOOP_VINFO_MASKS (loop_vinfo),
+                                      ncopies_in, vectype, op);
              break;
            }
        }
@@ -4694,7 +4687,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
                                = &LOOP_VINFO_MASKS (loop_vinfo);
                              tree loop_mask
                                = vect_get_loop_mask (loop_vinfo, gsi,
-                                                     loop_masks, ncopies,
+                                                     loop_masks, ncopies_in,
                                                      vectype, j);
                              vec_oprnd0
                                = prepare_vec_mask (loop_vinfo,
@@ -4728,10 +4721,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
              else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
                {
                  atype = bestn->simdclone->args[i].vector_type;
-                 /* Guess the number of lanes represented by atype.  */
+                 /* We cannot directly determine the number of mask elements
+                    in one mask argument for a SIMD clone based on the
+                    simdlen alone.  Use the vector input and rely on
+                    the earlier check that both match up.  */
                  poly_uint64 atype_subparts
-                   = exact_div (bestn->simdclone->simdlen,
-                                num_mask_args);
+                   = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
                  o = vector_unroll_factor (nunits, atype_subparts);
                  for (m = j * o; m < (j + 1) * o; m++)
                    {
@@ -4756,7 +4751,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
                                = &LOOP_VINFO_MASKS (loop_vinfo);
                              tree loop_mask
                                = vect_get_loop_mask (loop_vinfo, gsi,
-                                                     loop_masks, ncopies,
+                                                     loop_masks, ncopies_in,
                                                      vectype, j);
                              vec_oprnd0
                                = prepare_vec_mask (loop_vinfo,
-- 
2.51.0

Reply via email to