The following records both a possibly notinbranch and an inbranch
SIMD clone during analysis so that we can properly handle the
late decision on loop masking. Recording of linear-clause data
from analysis is extended to cover linear-clause arguments from
both clones.
This also fixes AVX512 masked loop code generation in line with
the previous fixes.
Bootstrapped and tested on x86_64-unknown-linux-gnu. Does this look OK?
Thanks,
Richard.
PR tree-optimization/122776
* tree-vectorizer.h (vect_simd_clone_data::clone,
vect_simd_clone_data::clone_inbranch): New fields for
the two selected clones.
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Record
both a possibly notinbranch and an inbranch clone. Delay
the choice between both to code generation based on
LOOP_VINFO_FULLY_MASKED_P. Fix masked loop inbranch
code generation for the AVX512 case.
* gcc.dg/vect/vect-simd-clone-24.c: New testcase.
* gcc.dg/gomp/pr110485.c: Adjust.
---
gcc/testsuite/gcc.dg/gomp/pr110485.c | 2 +-
.../gcc.dg/vect/vect-simd-clone-24.c | 22 ++++
gcc/tree-vect-stmts.cc | 102 ++++++++++--------
gcc/tree-vectorizer.h | 6 +-
4 files changed, 86 insertions(+), 46 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c
b/gcc/testsuite/gcc.dg/gomp/pr110485.c
index ba6817a127f..5183f3f403c 100644
--- a/gcc/testsuite/gcc.dg/gomp/pr110485.c
+++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
@@ -16,4 +16,4 @@ void foo (int n)
}
/* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
-/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a
non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump "can't use a fully-masked loop because no
masked simd clone was available" "vect" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
new file mode 100644
index 00000000000..081c19bf58f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd --param vect-partial-vector-usage=1
-fdump-tree-dce6" } */
+/* { dg-additional-options "-mavx512f" { target avx512f } } */
+
+#pragma omp declare simd simdlen(16)
+int __attribute__((const)) baz (int x);
+
+int a[1024];
+
+void foo (int n, int * __restrict b)
+{
+ for (int i = 0; i < n; ++i)
+ if (baz (a[i]))
+ b[i] = baz (b[i]);
+}
+
+/* One notinbranch SIMD call, one inbranch in the main vector loop and two
+ inbranch in the masked epilog. */
+/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] \\\(\[^,\]\+\\\)" 1
"dce6" { target avx512f } } } */
+/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\]
\\\(\[^,\]\+,\[^,\]\+\\\)" 3 "dce6" { target avx512f } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a18772f5928..bfed98b8af0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4219,9 +4219,12 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
unsigned group_size = SLP_TREE_LANES (slp_node);
unsigned int badness = 0;
+ unsigned int badness_inbranch = 0;
struct cgraph_node *bestn = NULL;
+ struct cgraph_node *bestn_inbranch = NULL;
if (!cost_vec)
- bestn = cgraph_node::get (simd_clone_info[0]);
+ bestn = ((loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ ? data.clone_inbranch : data.clone);
else
for (struct cgraph_node *n = node->simd_clones; n != NULL;
n = n->simdclone->next_clone)
@@ -4351,14 +4354,19 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
SIMD_CLONE_ARG_TYPE_MASK);
/* Penalize using a masked SIMD clone in a non-masked loop, that is
not in a branch, as we'd have to construct an all-true mask. */
- if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
- this_badness += 64;
+ this_badness += 64;
}
if (bestn == NULL || this_badness < badness)
{
bestn = n;
badness = this_badness;
}
+ if (n->simdclone->inbranch
+ && (bestn_inbranch == NULL || this_badness < badness_inbranch))
+ {
+ bestn_inbranch = n;
+ badness_inbranch = this_badness;
+ }
}
if (bestn == NULL)
@@ -4394,6 +4402,17 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
"incompatible vector types for invariants\n");
return false;
}
+
+ if (!bestn_inbranch && loop_vinfo)
+ {
+ if (dump_enabled_p ()
+ && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "can't use a fully-masked loop because no"
+ " masked simd clone was available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+
/* When the original call is pure or const but the SIMD ABI dictates
an aggregate return we will have to use a virtual definition and
in a loop eventually even need to add a virtual PHI. That's
@@ -4407,45 +4426,41 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
so automagic virtual operand updating doesn't work. */
if (gimple_vuse (stmt))
vinfo->any_known_not_updated_vssa = true;
- simd_clone_info.safe_push (bestn->decl);
- for (i = 0; i < bestn->simdclone->nargs; i++)
- {
- switch (bestn->simdclone->args[i].arg_type)
- {
- default:
- continue;
- case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
- case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
- {
- simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
- simd_clone_info.safe_push (arginfo[i].op);
- tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
- ? size_type_node : TREE_TYPE (arginfo[i].op);
- tree ls = build_int_cst (lst, arginfo[i].linear_step);
- simd_clone_info.safe_push (ls);
- tree sll = arginfo[i].simd_lane_linear
- ? boolean_true_node : boolean_false_node;
- simd_clone_info.safe_push (sll);
- }
- break;
- case SIMD_CLONE_ARG_TYPE_MASK:
- if (loop_vinfo
- && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
- vect_record_loop_mask (loop_vinfo,
- &LOOP_VINFO_MASKS (loop_vinfo),
- ncopies_in, vectype, op);
- break;
- }
- }
- if (!bestn->simdclone->inbranch && loop_vinfo)
- {
- if (dump_enabled_p ()
- && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
- dump_printf_loc (MSG_NOTE, vect_location,
- "can't use a fully-masked loop because a"
- " non-masked simd clone was selected.\n");
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ data.clone = bestn;
+ data.clone_inbranch = bestn_inbranch;
+
+ simd_clone_info.safe_push (NULL_TREE);
+ for (i = 0;
+ i < (bestn_inbranch ? bestn_inbranch : bestn)->simdclone->nargs; i++)
+ {
+ if (loop_vinfo
+ && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && (bestn_inbranch->simdclone->args[i].arg_type
+ == SIMD_CLONE_ARG_TYPE_MASK))
+ vect_record_loop_mask (loop_vinfo,
+ &LOOP_VINFO_MASKS (loop_vinfo),
+ ncopies_in, vectype, op);
+ else if ((bestn->simdclone->args[i].arg_type
+ == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
+ || (bestn->simdclone->args[i].arg_type
+ == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP)
+ || (bestn_inbranch
+ && ((bestn_inbranch->simdclone->args[i].arg_type
+ == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
+ || (bestn_inbranch->simdclone->args[i].arg_type
+ ==
SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP))))
+ {
+ simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
+ simd_clone_info.safe_push (arginfo[i].op);
+ tree lst = (POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
+ ? size_type_node : TREE_TYPE (arginfo[i].op));
+ tree ls = build_int_cst (lst, arginfo[i].linear_step);
+ simd_clone_info.safe_push (ls);
+ tree sll = (arginfo[i].simd_lane_linear
+ ? boolean_true_node : boolean_false_node);
+ simd_clone_info.safe_push (sll);
+ }
}
SLP_TREE_TYPE (slp_node) = call_simd_clone_vec_info_type;
@@ -4816,9 +4831,8 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
tree masktype = bestn->simdclone->args[mask_i].vector_type;
if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
- /* Guess the number of lanes represented by masktype. */
callee_nelements = exact_div (bestn->simdclone->simdlen,
- bestn->simdclone->nargs - nargs);
+
bestn->simdclone->args[i].linear_step);
else
callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
o = vector_unroll_factor (nunits, callee_nelements);
@@ -4828,7 +4842,7 @@ vectorizable_simd_clone_call (vec_info *vinfo,
stmt_vec_info stmt_info,
{
vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
- ncopies, masktype, j);
+ ncopies_in, vectype, j);
}
else
mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 55f0bee0eb7..3b264a6102c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -265,8 +265,12 @@ struct vect_simd_clone_data : vect_data {
vect_simd_clone_data () = default;
vect_simd_clone_data (vect_simd_clone_data &&other) = default;
+ /* Selected SIMD clone and clone for in-branch. */
+ cgraph_node *clone;
+ cgraph_node *clone_inbranch;
+
/* Selected SIMD clone's function info. First vector element
- is SIMD clone's function decl, followed by a pair of trees (base + step)
+ is NULL_TREE, followed by a pair of trees (base + step)
for linear arguments (pair of NULLs for other arguments). */
auto_vec<tree> simd_clone_info;
};
--
2.51.0