The following records both a possibly notinbranch and an inbranch
SIMD clone during analysis so that we can properly handle the
late decision on loop masking. I'm not sure to what extent things
like SIMD_CLONE_ARG_TYPE_LINEAR_* and friends are uniform amongst
all SIMD clones for a function. If there can be actual differences
that would result in different recordings in simd_clone_info[]
we'd have to duplicate those as well.
This also fixes AVX512 masked loop code generation in line with
the previous fixes. I'll split that out in case there are
difficulties with the rest.
OK for trunk?
Thanks,
Richard.
PR tree-optimization/122776
* tree-vectorizer.h (vect_simd_clone_data::clone,
vect_simd_clone_data::clone_inbranch): New fields for
the two selected clones.
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Record
both a possibly notinbranch and an inbranch clone. Delay
the choice between both to code generation based on
LOOP_VINFO_FULLY_MASKED_P. Fix masked loop inbranch
code generation for the AVX512 case.
* gcc.dg/vect/vect-simd-clone-24.c: New testcase.
* gcc.dg/gomp/pr110485.c: Adjust.
---
gcc/testsuite/gcc.dg/gomp/pr110485.c | 2 +-
.../gcc.dg/vect/vect-simd-clone-24.c | 22 ++++++++++++++
gcc/tree-vect-stmts.cc | 29 ++++++++++++-------
gcc/tree-vectorizer.h | 6 +++-
4 files changed, 47 insertions(+), 12 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c
b/gcc/testsuite/gcc.dg/gomp/pr110485.c
index ba6817a127f..5183f3f403c 100644
--- a/gcc/testsuite/gcc.dg/gomp/pr110485.c
+++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
@@ -16,4 +16,4 @@ void foo (int n)
}
/* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
-/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a
non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump "can't use a fully-masked loop because no
masked simd clone was available" "vect" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
new file mode 100644
index 00000000000..081c19bf58f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd --param vect-partial-vector-usage=1
-fdump-tree-dce6" } */
+/* { dg-additional-options "-mavx512f" { target avx512f } } */
+
+#pragma omp declare simd simdlen(16)
+int __attribute__((const)) baz (int x);
+
+int a[1024];
+
+void foo (int n, int * __restrict b)
+{
+ for (int i = 0; i < n; ++i)
+ if (baz (a[i]))
+ b[i] = baz (b[i]);
+}
+
+/* One notinbranch SIMD call, one inbranch in the main vector loop and two
+ inbranch in the masked epilog. */
+/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] \\\(\[^,\]\+\\\)" 1
"dce6" { target avx512f } } } */
+/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\]
\\\(\[^,\]\+,\[^,\]\+\\\)" 3 "dce6" { target avx512f } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a18772f5928..3121797d67d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4219,9 +4219,12 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
unsigned group_size = SLP_TREE_LANES (slp_node);
unsigned int badness = 0;
+ unsigned int badness_inbranch = 0;
struct cgraph_node *bestn = NULL;
+ struct cgraph_node *bestn_inbranch = NULL;
if (!cost_vec)
- bestn = cgraph_node::get (simd_clone_info[0]);
+ bestn = ((loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ ? data.clone_inbranch : data.clone);
else
for (struct cgraph_node *n = node->simd_clones; n != NULL;
n = n->simdclone->next_clone)
@@ -4351,14 +4354,19 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
SIMD_CLONE_ARG_TYPE_MASK);
/* Penalize using a masked SIMD clone in a non-masked loop, that is
not in a branch, as we'd have to construct an all-true mask. */
- if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
- this_badness += 64;
+ this_badness += 64;
}
if (bestn == NULL || this_badness < badness)
{
bestn = n;
badness = this_badness;
}
+ if (n->simdclone->inbranch
+ && (bestn_inbranch == NULL || this_badness < badness_inbranch))
+ {
+ bestn_inbranch = n;
+ badness_inbranch = this_badness;
+ }
}
if (bestn == NULL)
@@ -4407,7 +4415,9 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
so automagic virtual operand updating doesn't work. */
if (gimple_vuse (stmt))
vinfo->any_known_not_updated_vssa = true;
- simd_clone_info.safe_push (bestn->decl);
+ data.clone = bestn;
+ data.clone_inbranch = bestn_inbranch;
+ simd_clone_info.safe_push (NULL_TREE);
for (i = 0; i < bestn->simdclone->nargs; i++)
{
switch (bestn->simdclone->args[i].arg_type)
@@ -4438,13 +4448,13 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
}
}
- if (!bestn->simdclone->inbranch && loop_vinfo)
+ if (!bestn_inbranch && loop_vinfo)
{
if (dump_enabled_p ()
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
dump_printf_loc (MSG_NOTE, vect_location,
- "can't use a fully-masked loop because a"
- " non-masked simd clone was selected.\n");
+ "can't use a fully-masked loop because no"
+ " masked simd clone was available.\n");
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
@@ -4816,9 +4826,8 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
tree masktype = bestn->simdclone->args[mask_i].vector_type;
if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
- /* Guess the number of lanes represented by masktype. */
callee_nelements = exact_div (bestn->simdclone->simdlen,
- bestn->simdclone->nargs - nargs);
+
bestn->simdclone->args[i].linear_step);
else
callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
o = vector_unroll_factor (nunits, callee_nelements);
@@ -4828,7 +4837,7 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
{
vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
- ncopies, masktype, j);
+ ncopies_in, vectype, j);
}
else
mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 55f0bee0eb7..3b264a6102c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -265,8 +265,12 @@ struct vect_simd_clone_data : vect_data {
vect_simd_clone_data () = default;
vect_simd_clone_data (vect_simd_clone_data &&other) = default;
+ /* Selected SIMD clone and clone for in-branch. */
+ cgraph_node *clone;
+ cgraph_node *clone_inbranch;
+
/* Selected SIMD clone's function info. First vector element
- is SIMD clone's function decl, followed by a pair of trees (base + step)
+ is NULL_TREE, followed by a pair of trees (base + step)
for linear arguments (pair of NULLs for other arguments). */
auto_vec<tree> simd_clone_info;
};
--
2.51.0