On Mon, 24 Nov 2025, Richard Biener wrote:
> The following records both a possibly notinbranch and an inbranch
> SIMD clone during analysis so that we can properly handle the
> late decision on loop masking. Recording of linear-clause data
> from analysis is extended to cover linear-clause arguments from
> both clones.
>
> This also fixes AVX512 masked loop code generation in line with
> the previous fixes.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu. Does this look OK?
I have split out and pushed the last two hunks of the
vectorizable_simd_clone_call part as a correctness fix, so what
remains here is a missed-optimization fix.
Richard.
> Thanks,
> Richard.
>
> PR tree-optimization/122776
> * tree-vectorizer.h (vect_simd_clone_data::clone,
> vect_simd_clone_data::clone_inbranch): New fields for
> the two selected clones.
> * tree-vect-stmts.cc (vectorizable_simd_clone_call): Record
> both a possibly notinbranch and an inbranch clone. Delay
> the choice between both to code generation based on
> LOOP_VINFO_FULLY_MASKED_P. Fix masked loop inbranch
> code generation for the AVX512 case.
>
> * gcc.dg/vect/vect-simd-clone-24.c: New testcase.
> * gcc.dg/gomp/pr110485.c: Adjust.
> ---
> gcc/testsuite/gcc.dg/gomp/pr110485.c | 2 +-
> .../gcc.dg/vect/vect-simd-clone-24.c | 22 ++++
> gcc/tree-vect-stmts.cc | 102 ++++++++++--------
> gcc/tree-vectorizer.h | 6 +-
> 4 files changed, 86 insertions(+), 46 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
>
> diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c
> b/gcc/testsuite/gcc.dg/gomp/pr110485.c
> index ba6817a127f..5183f3f403c 100644
> --- a/gcc/testsuite/gcc.dg/gomp/pr110485.c
> +++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
> @@ -16,4 +16,4 @@ void foo (int n)
> }
>
> /* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
> -/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a
> non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
> +/* { dg-final { scan-tree-dump "can't use a fully-masked loop because no
> masked simd clone was available" "vect" { target x86_64-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
> b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
> new file mode 100644
> index 00000000000..081c19bf58f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
> +/* { dg-require-effective-target vect_simd_clones } */
> +/* { dg-additional-options "-fopenmp-simd --param
> vect-partial-vector-usage=1 -fdump-tree-dce6" } */
> +/* { dg-additional-options "-mavx512f" { target avx512f } } */
> +
> +#pragma omp declare simd simdlen(16)
> +int __attribute__((const)) baz (int x);
> +
> +int a[1024];
> +
> +void foo (int n, int * __restrict b)
> +{
> + for (int i = 0; i < n; ++i)
> + if (baz (a[i]))
> + b[i] = baz (b[i]);
> +}
> +
> +/* One notinbranch SIMD call, one inbranch in the main vector loop and two
> + inbranch in the masked epilog. */
> +/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] \\\(\[^,\]\+\\\)" 1
> "dce6" { target avx512f } } } */
> +/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\]
> \\\(\[^,\]\+,\[^,\]\+\\\)" 3 "dce6" { target avx512f } } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a18772f5928..bfed98b8af0 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -4219,9 +4219,12 @@ vectorizable_simd_clone_call (vec_info *vinfo,
> stmt_vec_info stmt_info,
> poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
> unsigned group_size = SLP_TREE_LANES (slp_node);
> unsigned int badness = 0;
> + unsigned int badness_inbranch = 0;
> struct cgraph_node *bestn = NULL;
> + struct cgraph_node *bestn_inbranch = NULL;
> if (!cost_vec)
> - bestn = cgraph_node::get (simd_clone_info[0]);
> + bestn = ((loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> + ? data.clone_inbranch : data.clone);
> else
> for (struct cgraph_node *n = node->simd_clones; n != NULL;
> n = n->simdclone->next_clone)
> @@ -4351,14 +4354,19 @@ vectorizable_simd_clone_call (vec_info *vinfo,
> stmt_vec_info stmt_info,
> SIMD_CLONE_ARG_TYPE_MASK);
> /* Penalize using a masked SIMD clone in a non-masked loop, that is
> not in a branch, as we'd have to construct an all-true mask. */
> - if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> - this_badness += 64;
> + this_badness += 64;
> }
> if (bestn == NULL || this_badness < badness)
> {
> bestn = n;
> badness = this_badness;
> }
> + if (n->simdclone->inbranch
> + && (bestn_inbranch == NULL || this_badness < badness_inbranch))
> + {
> + bestn_inbranch = n;
> + badness_inbranch = this_badness;
> + }
> }
>
> if (bestn == NULL)
> @@ -4394,6 +4402,17 @@ vectorizable_simd_clone_call (vec_info *vinfo,
> stmt_vec_info stmt_info,
> "incompatible vector types for invariants\n");
> return false;
> }
> +
> + if (!bestn_inbranch && loop_vinfo)
> + {
> + if (dump_enabled_p ()
> + && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "can't use a fully-masked loop because no"
> + " masked simd clone was available.\n");
> + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
> +
> /* When the original call is pure or const but the SIMD ABI dictates
> an aggregate return we will have to use a virtual definition and
> in a loop eventually even need to add a virtual PHI. That's
> @@ -4407,45 +4426,41 @@ vectorizable_simd_clone_call (vec_info *vinfo,
> stmt_vec_info stmt_info,
> so automagic virtual operand updating doesn't work. */
> if (gimple_vuse (stmt))
> vinfo->any_known_not_updated_vssa = true;
> - simd_clone_info.safe_push (bestn->decl);
> - for (i = 0; i < bestn->simdclone->nargs; i++)
> - {
> - switch (bestn->simdclone->args[i].arg_type)
> - {
> - default:
> - continue;
> - case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
> - case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
> - {
> - simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
> - simd_clone_info.safe_push (arginfo[i].op);
> - tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
> - ? size_type_node : TREE_TYPE (arginfo[i].op);
> - tree ls = build_int_cst (lst, arginfo[i].linear_step);
> - simd_clone_info.safe_push (ls);
> - tree sll = arginfo[i].simd_lane_linear
> - ? boolean_true_node : boolean_false_node;
> - simd_clone_info.safe_push (sll);
> - }
> - break;
> - case SIMD_CLONE_ARG_TYPE_MASK:
> - if (loop_vinfo
> - && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> - vect_record_loop_mask (loop_vinfo,
> - &LOOP_VINFO_MASKS (loop_vinfo),
> - ncopies_in, vectype, op);
> - break;
> - }
> - }
>
> - if (!bestn->simdclone->inbranch && loop_vinfo)
> - {
> - if (dump_enabled_p ()
> - && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> - dump_printf_loc (MSG_NOTE, vect_location,
> - "can't use a fully-masked loop because a"
> - " non-masked simd clone was selected.\n");
> - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + data.clone = bestn;
> + data.clone_inbranch = bestn_inbranch;
> +
> + simd_clone_info.safe_push (NULL_TREE);
> + for (i = 0;
> + i < (bestn_inbranch ? bestn_inbranch : bestn)->simdclone->nargs; i++)
> + {
> + if (loop_vinfo
> + && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> + && (bestn_inbranch->simdclone->args[i].arg_type
> + == SIMD_CLONE_ARG_TYPE_MASK))
> + vect_record_loop_mask (loop_vinfo,
> + &LOOP_VINFO_MASKS (loop_vinfo),
> + ncopies_in, vectype, op);
> + else if ((bestn->simdclone->args[i].arg_type
> + == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
> + || (bestn->simdclone->args[i].arg_type
> + == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP)
> + || (bestn_inbranch
> + && ((bestn_inbranch->simdclone->args[i].arg_type
> + == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
> + || (bestn_inbranch->simdclone->args[i].arg_type
> + ==
> SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP))))
> + {
> + simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
> + simd_clone_info.safe_push (arginfo[i].op);
> + tree lst = (POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
> + ? size_type_node : TREE_TYPE (arginfo[i].op));
> + tree ls = build_int_cst (lst, arginfo[i].linear_step);
> + simd_clone_info.safe_push (ls);
> + tree sll = (arginfo[i].simd_lane_linear
> + ? boolean_true_node : boolean_false_node);
> + simd_clone_info.safe_push (sll);
> + }
> }
>
> SLP_TREE_TYPE (slp_node) = call_simd_clone_vec_info_type;
> @@ -4816,9 +4831,8 @@ vectorizable_simd_clone_call (vec_info *vinfo,
> stmt_vec_info stmt_info,
>
> tree masktype = bestn->simdclone->args[mask_i].vector_type;
> if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
> - /* Guess the number of lanes represented by masktype. */
> callee_nelements = exact_div (bestn->simdclone->simdlen,
> - bestn->simdclone->nargs - nargs);
> +
> bestn->simdclone->args[i].linear_step);
> else
> callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
> o = vector_unroll_factor (nunits, callee_nelements);
> @@ -4828,7 +4842,7 @@ vectorizable_simd_clone_call (vec_info *vinfo,
> stmt_vec_info stmt_info,
> {
> vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
> mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> - ncopies, masktype, j);
> + ncopies_in, vectype, j);
> }
> else
> mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 55f0bee0eb7..3b264a6102c 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -265,8 +265,12 @@ struct vect_simd_clone_data : vect_data {
> vect_simd_clone_data () = default;
> vect_simd_clone_data (vect_simd_clone_data &&other) = default;
>
> + /* Selected SIMD clone and clone for in-branch. */
> + cgraph_node *clone;
> + cgraph_node *clone_inbranch;
> +
> /* Selected SIMD clone's function info. First vector element
> - is SIMD clone's function decl, followed by a pair of trees (base + step)
> + is NULL_TREE, followed by a pair of trees (base + step)
> for linear arguments (pair of NULLs for other arguments). */
> auto_vec<tree> simd_clone_info;
> };
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)